diff --git a/.gitignore b/.gitignore index 16d03d4a193..0a0a9f2c3fe 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,14 @@ GSYMS /tools/openfst-1.3.4/ /tools/openfst-1.4.1.tar.gz /tools/openfst-1.4.1/ +/tools/openfst-1.5.4.tar.gz +/tools/openfst-1.5.4/ +/tools/openfst-1.6.0.tar.gz +/tools/openfst-1.6.0/ +/tools/openfst-1.6.1.tar.gz +/tools/openfst-1.6.1/ +/tools/openfst-1.6.2.tar.gz +/tools/openfst-1.6.2/ /tools/pa_stable_v19_20111121.tgz /tools/portaudio/ /tools/sctk-2.4.0-20091110-0958.tar.bz2 @@ -114,6 +122,7 @@ GSYMS /tools/pthreads /tools/pthreads*.zip /tools/sequitur +/tools/sequitur-g2p /tools/srilm.tgz /tools/liblbfgs-1.10.tar.gz /tools/liblbfgs-1.10/ @@ -124,4 +133,3 @@ GSYMS /tools/sequitur-g2p/ /kaldiwin_vs* - diff --git a/.travis.yml b/.travis.yml index 85bbc7a52e4..f8e2bac0362 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,16 +10,19 @@ addons: apt: sources: - ubuntu-toolchain-r-test + - llvm-toolchain-precise-3.8 packages: - gdb - gcc-4.9 - g++-4.9 - gfortran-4.9 - liblapack-dev + - clang-3.8 branches: only: - master + - shortcut before_install: - cat /proc/sys/kernel/core_pattern @@ -27,7 +30,7 @@ before_install: - tools/extras/travis_install_bindeps.sh $XROOT script: - - CXX=g++-4.9 + - CXX=clang++-3.8 CFLAGS="-march=native" LDFLAGS="-llapack" INCDIRS="$XROOT/usr/include" diff --git a/README.md b/README.md index 32d4945a909..73abe9f1e3f 100644 --- a/README.md +++ b/README.md @@ -40,25 +40,30 @@ Development pattern for contributors ------------------------------------ 1. [Create a personal fork](https://help.github.com/articles/fork-a-repo/) - of the [main Kaldi repository] (https://github.com/kaldi-asr/kaldi) in GitHub. + of the [main Kaldi repository](https://github.com/kaldi-asr/kaldi) in GitHub. 2. Make your changes in a named branch different from `master`, e.g. you create a branch `my-awesome-feature`. 3. [Generate a pull request](https://help.github.com/articles/creating-a-pull-request/) through the Web interface of GitHub. -4. As a general rule, please follow [Google C++ Style Guide] - (https://google.github.io/styleguide/cppguide.html). +4. As a general rule, please follow [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). There are a [few exceptions in Kaldi](http://kaldi-asr.org/doc/style.html). - You can use the [Google's cpplint.py] - (https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py) + You can use the [Google's cpplint.py](https://raw.githubusercontent.com/google/styleguide/gh-pages/cpplint/cpplint.py) to verify that your code is free of basic mistakes. Platform specific notes ----------------------- -PowerPC 64bits little-endian (ppc64le): +### PowerPC 64bits little-endian (ppc64le) + - Kaldi is expected to work out of the box in RHEL >= 7 and Ubuntu >= 16.04 with OpenBLAS, ATLAS, or CUDA. -- CUDA drivers for ppc64le can be found at [https://developer.nvidia.com/cuda-downloads] - (https://developer.nvidia.com/cuda-downloads). -- An [IBM Redbook] (https://www.redbooks.ibm.com/abstracts/redp5169.html) is +- CUDA drivers for ppc64le can be found at [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). +- An [IBM Redbook](https://www.redbooks.ibm.com/abstracts/redp5169.html) is available as a guide to install and configure CUDA. + +### Android + +- Kaldi supports cross compiling for Android using Android NDK, clang++ and + OpenBLAS. +- See [this blog post](http://jcsilva.github.io/2017/03/18/compile-kaldi-android/) + for details. 
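As a quick illustration of the contributor workflow that the README changes above describe (personal fork, named feature branch, pull request, optional cpplint check), here is a minimal sketch. The fork URL, user name, and branch name below are placeholders for illustration only and are not part of this patch.

```bash
# Minimal sketch of the Kaldi contribution workflow from the README above.
# "YOURUSER" and "my-awesome-feature" are placeholder names, not values from this patch.
git clone https://github.com/YOURUSER/kaldi.git          # clone your personal fork
cd kaldi
git remote add upstream https://github.com/kaldi-asr/kaldi.git
git checkout -b my-awesome-feature                       # work on a named branch, not master
# ... edit sources, commit ...
python cpplint.py src/somedir/some-file.cc               # optional: check against the style guide
git push origin my-awesome-feature                       # then open a pull request on GitHub
```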
diff --git a/egs/ami/s5/local/ami_ihm_data_prep.sh b/egs/ami/s5/local/ami_ihm_data_prep.sh index 3a1d43d1ea1..b3ec1723713 100755 --- a/egs/ami/s5/local/ami_ihm_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_data_prep.sh @@ -69,7 +69,7 @@ sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh index c3b9914d7a0..b69732a61eb 100755 --- a/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_ihm_scoring_data_prep.sh @@ -68,7 +68,7 @@ sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5/local/ami_mdm_data_prep.sh b/egs/ami/s5/local/ami_mdm_data_prep.sh index bc7e4180b4a..2cc973cb2d5 100755 --- a/egs/ami/s5/local/ami_mdm_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_data_prep.sh @@ -75,7 +75,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp #prep reco2file_and_channel cat $dir/wav.scp | \ diff --git a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh index ab0fd185f70..8d9e24a9838 100755 --- a/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_mdm_scoring_data_prep.sh @@ -67,7 +67,7 @@ sed -e 's?.*/??' -e 's?.wav??' 
$tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5/local/ami_sdm_data_prep.sh b/egs/ami/s5/local/ami_sdm_data_prep.sh index 8eda00f1d15..e662759a610 100755 --- a/egs/ami/s5/local/ami_sdm_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_data_prep.sh @@ -74,7 +74,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select a single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # this file reco2file_and_channel maps recording-id cat $dir/wav.scp | \ diff --git a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh index 01173d2e3a6..3fa7c938479 100755 --- a/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5/local/ami_sdm_scoring_data_prep.sh @@ -72,7 +72,7 @@ sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh b/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh index 24176d69a34..a6c2d02b7af 100755 --- a/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh +++ b/egs/ami/s5/local/online/run_nnet2_ms_perturbed.sh @@ -8,7 +8,7 @@ # This example script demonstrates how speed perturbation of the data helps the nnet training in the SWB setup. . ./cmd.sh -set -e +set -e stage=1 train_stage=-10 use_gpu=true @@ -27,13 +27,13 @@ fix_nnet=false if $use_gpu; then if ! cuda-compiled; then - cat < $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh index 3ae42afb3d8..746c42c4c1a 100755 --- a/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_ihm_scoring_data_prep.sh @@ -74,7 +74,7 @@ sed -e 's?.*/??' -e 's?.wav??' 
$dir/wav.flist | \ awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp > $dir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # (1d) reco2file_and_channel cat $dir/wav.scp \ diff --git a/egs/ami/s5b/local/ami_mdm_data_prep.sh b/egs/ami/s5b/local/ami_mdm_data_prep.sh index 0ab11c5893b..d100347a356 100755 --- a/egs/ami/s5b/local/ami_mdm_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_data_prep.sh @@ -79,7 +79,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp #prep reco2file_and_channel cat $dir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh index 4fbfe12ccad..65f514f223c 100755 --- a/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_mdm_scoring_data_prep.sh @@ -72,7 +72,7 @@ sed -e 's?.*/??' -e 's?.wav??' $tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_sdm_data_prep.sh b/egs/ami/s5b/local/ami_sdm_data_prep.sh index 267aef75535..327595070a6 100755 --- a/egs/ami/s5b/local/ami_sdm_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_data_prep.sh @@ -86,7 +86,7 @@ awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \ awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text #replace path with an appropriate sox command that select a single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp # this file reco2file_and_channel maps recording-id cat $dir/wav.scp | \ diff --git a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh index d0609e552cd..1378f8b8965 100755 --- a/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh +++ b/egs/ami/s5b/local/ami_sdm_scoring_data_prep.sh @@ -82,7 +82,7 @@ sed -e 's?.*/??' -e 's?.wav??' 
$tmpdir/wav.flist | \ awk '{print $2}' $tmpdir/segments | sort -u | join - $tmpdir/wav1.scp > $tmpdir/wav2.scp #replace path with an appropriate sox command that select single channel only -awk '{print $1" sox -c 1 -t wavpcm -s "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp +awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -t wavpcm - |"}' $tmpdir/wav2.scp > $tmpdir/wav.scp #prep reco2file_and_channel cat $tmpdir/wav.scp | \ diff --git a/egs/ami/s5b/local/chain/run_tdnn.sh b/egs/ami/s5b/local/chain/run_tdnn.sh index 61f8f499182..e1adaa9346d 120000 --- a/egs/ami/s5b/local/chain/run_tdnn.sh +++ b/egs/ami/s5b/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1b.sh \ No newline at end of file +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/run_tdnn_lstm.sh b/egs/ami/s5b/local/chain/run_tdnn_lstm.sh index 43145248fbd..23906f31954 120000 --- a/egs/ami/s5b/local/chain/run_tdnn_lstm.sh +++ b/egs/ami/s5b/local/chain/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1i.sh \ No newline at end of file +tuning/run_tdnn_lstm_1j.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..a9f228cb55d --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,269 @@ +#!/bin/bash + +# same as 1b but uses PCA instead of +# LDA features for the ivector extractor. + +# Results on 03/27/2017: +# local/chain/compare_wer_general.sh ihm tdnn1b_sp_bi tdnn1d_sp_bi +# System tdnn1b_sp_bi tdnn1d_sp_bi +# WER on dev 22.0 21.9 +# WER on eval 22.2 22.3 +# Final train prob -0.0813472 -0.0807054 +# Final valid prob -0.132032 -0.133564 +# Final train prob (xent) -1.41543 -1.41951 +# Final valid prob (xent) -1.62316 -1.63021 + +set -e -o pipefail +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +ivector_transform_type=pca +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1d #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. 
+ + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh new file mode 100755 index 00000000000..008060df070 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -0,0 +1,306 @@ +#!/bin/bash + +# 1j is same as 1i but with changes related to fast-lstmp layer +# changed num-chunk-per-minibatch to be variable +# added extra_left_context_initial=0 +# and extra_right_context_final=0 +# These changes are similar to those between swbd's run_tdnn_lstm_1{c,d}.sh +# recipes + +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned \ +#System tdnn_lstm1i_sp_bi_ihmali_ld5 tdnn_lstm1j_sp_bi_ihmali_ld5 +#WER on dev 37.6 37.3 +#WER on eval 40.9 40.4 +#Final train prob -0.114135 -0.118532 +#Final valid prob -0.245208 -0.245593 +#Final train prob (xent) -1.47648 -1.48337 +#Final valid prob (xent) -2.16365 -2.11097 + +# steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/ +# exp/sdm1/chain_cleaned/tdnn_lstm1i_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3770 combine=-0.142->-0.131 xent:train/valid[57,86,final]=(-1.78,-1.48,-1.48/-2.22,-2.17,-2.16) logprob:train/valid[57,86,final]=(-0.157,-0.117,-0.114/-0.243,-0.249,-0.245) +# exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3770 combine=-0.139->-0.130 xent:train/valid[57,86,final]=(-1.82,-1.50,-1.48/-2.18,-2.12,-2.11) logprob:train/valid[57,86,final]=(-0.165,-0.121,-0.119/-0.240,-0.247,-0.246) + +set -e -o pipefail + +# First the options that are passed through to 
run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1j #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn7 
input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh new file mode 100755 index 00000000000..b8d947d8e92 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -0,0 +1,302 @@ +#!/bin/bash + +# 1k is same as 1j but with smaller delay on the first lstm layer +# there is a 37% increase in training time 11hrs vs 8hrs and the gains are modest + +# Results with flags : --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +#System tdnn_lstm1j_sp_bi_ihmali_ld5 tdnn_lstm1k_sp_bi_ihmali_ld5 +#WER on dev 37.3 36.9 +#WER on eval 40.4 40.0 +#Final train prob -0.118532 -0.119421 +#Final valid prob -0.245593 -0.24915 +#Final train prob (xent) -1.48337 -1.48024 +#Final valid prob (xent) -2.11097 -2.1196 + +#steps/info/chain_dir_info.pl exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/ exp/sdm1/chain_cleaned/tdnn_lstm1k_sp_bi_ihmali_ld5 +# exp/sdm1/chain_cleaned/tdnn_lstm1j_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3770 combine=-0.139->-0.130 xent:train/valid[57,86,final]=(-1.82,-1.50,-1.48/-2.18,-2.12,-2.11) logprob:train/valid[57,86,final]=(-0.165,-0.121,-0.119/-0.240,-0.247,-0.246) +# exp/sdm1/chain_cleaned/tdnn_lstm1k_sp_bi_ihmali_ld5/: num-iters=87 nj=2..12 num-params=43.4M dim=40+100->3770 combine=-0.140->-0.130 xent:train/valid[57,86,final]=(-1.81,-1.49,-1.48/-2.19,-2.13,-2.12) logprob:train/valid[57,86,final]=(-0.163,-0.121,-0.119/-0.242,-0.249,-0.249) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1k #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-1 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/nnet3/run_ivector_common.sh b/egs/ami/s5b/local/nnet3/run_ivector_common.sh index bccbb42494c..860009c5ef5 100755 --- a/egs/ami/s5b/local/nnet3/run_ivector_common.sh +++ b/egs/ami/s5b/local/nnet3/run_ivector_common.sh @@ -17,8 +17,8 @@ train_set=train # you might set this to e.g. train_cleaned. gmm=tri3 # This specifies a GMM-dir from the features of the type you're training the system on; # it should contain alignments for 'train_set'. 
- num_threads_ubm=32 +ivector_transform_type=lda nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stuff in, so it # becomes exp/$mic/nnet3_cleaned or whatever. @@ -30,7 +30,7 @@ nnet3_affix=_cleaned # affix for exp/$mic/nnet3 directory to put iVector stu gmmdir=exp/${mic}/${gmm} -for f in data/${mic}/${train_set}/feats.scp ${gmmdir}/final.mdl; do +for f in data/${mic}/${train_set}/feats.scp ; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" exit 1 @@ -110,20 +110,36 @@ if [ $stage -le 4 ]; then echo "$0: warning: number of feats $n1 != $n2, if these are very different it could be bad." fi - echo "$0: training a system on the hires data for its LDA+MLLT transform, in order to produce the diagonal GMM." - if [ -e exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl ]; then - # we don't want to overwrite old stuff, ask the user to delete it. - echo "$0: exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl already exists: " - echo " ... please delete and then rerun, or use a later --stage option." - exit 1; - fi - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 7 --mllt-iters "2 4 6" \ - --splice-opts "--left-context=3 --right-context=3" \ - 3000 10000 $temp_data_root/${train_set}_hires data/lang \ - $gmmdir exp/$mic/nnet3${nnet3_affix}/tri5 + case $ivector_transform_type in + lda) + if [ ! -f ${gmmdir}/final.mdl ]; then + echo "$0: expected file ${gmmdir}/final.mdl to exist" + exit 1; + fi + echo "$0: training a system on the hires data for its LDA+MLLT transform, in order to produce the diagonal GMM." + if [ -e exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl ]; then + # we don't want to overwrite old stuff, ask the user to delete it. + echo "$0: exp/$mic/nnet3${nnet3_affix}/tri5/final.mdl already exists: " + echo " ... please delete and then rerun, or use a later --stage option." + exit 1; + fi + steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 7 --mllt-iters "2 4 6" \ + --splice-opts "--left-context=3 --right-context=3" \ + 3000 10000 $temp_data_root/${train_set}_hires data/lang \ + $gmmdir exp/$mic/nnet3${nnet3_affix}/tri5 + ;; + pca) + echo "$0: computing a PCA transform from the hires data." + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" \ + --max-utts 10000 --subsample 2 \ + $temp_data_root/${train_set}_hires \ + exp/$mic/nnet3${nnet3_affix}/tri5 + ;; + *) echo "$0: invalid iVector transform type $ivector_transform_type" && exit 1; + esac fi - if [ $stage -le 5 ]; then echo "$0: computing a subset of data to train the diagonal UBM." diff --git a/egs/aspire/s5/local/multi_condition/decode.sh b/egs/aspire/s5/local/multi_condition/decode.sh index 566524095a6..b09c4780e71 100755 --- a/egs/aspire/s5/local/multi_condition/decode.sh +++ b/egs/aspire/s5/local/multi_condition/decode.sh @@ -47,7 +47,7 @@ if [ $# -ne 3 ]; then echo " --iter # Iteration of model to decode; default is final." echo " --scoring-opts # options to local/score.sh" echo " --num-threads # number of threads to use, default 1." - echo " --parallel-opts # e.g. '-pe smp 4' if you supply --num-threads 4" + echo " --parallel-opts # e.g. 
'--num-threads 4' if you supply --num-threads 4" exit 1; fi diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh index 3b778b23162..4e34c78255a 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms.sh @@ -28,7 +28,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. Otherwise, call this script with --use-gpu false EOF fi - parallel_opts="-l gpu=1" + parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 @@ -47,7 +47,7 @@ else # almost the same, but this may be a little bit slow. num_threads=16 minibatch_size=128 - parallel_opts="-pe smp $num_threads" + parallel_opts="--num-threads $num_threads" fi # do the common parts of the script. diff --git a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh index ad5fba0929f..dc285f28f8e 100755 --- a/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh +++ b/egs/aspire/s5/local/multi_condition/run_nnet2_ms_disc.sh @@ -8,7 +8,7 @@ # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# # Note: rather than using any features we have dumped on disk, this script # regenerates them from the wav data three times-- when we do lattice # generation, numerator alignment and discriminative training. This made the @@ -42,20 +42,20 @@ set -e if $use_gpu; then if ! cuda-compiled; then - cat <" + + + diff --git a/egs/babel/s5d/conf/lang/404-georgian.LLP.official.conf b/egs/babel/s5d/conf/lang/404-georgian.LLP.official.conf new file mode 100644 index 00000000000..570bcab68ec --- /dev/null +++ b/egs/babel/s5d/conf/lang/404-georgian.LLP.official.conf @@ -0,0 +1,54 @@ +# include common settings for fullLP systems. +. 
conf/common.limitedLP || exit 1; + + +#speech corpora files location +train_data_dir=/export/babel/data//404-georgian/IARPA-babel404b-v1.0a-build/BABEL_OP3_404/conversational/training +train_data_list=./conf/lists/404-georgian//sub-train.list +train_nj=32 + + +#Radical reduced DEV corpora files location +dev2h_data_dir=/export/babel/data//404-georgian/IARPA-babel404b-v1.0a-build/BABEL_OP3_404/conversational/dev +dev2h_data_list=./conf/lists/404-georgian//dev.2h.list +dev2h_rttm_file=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.mitllfa3.rttm +dev2h_ecf_file=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.scoring.ecf.xml +dev2h_stm_file=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.stm +dev2h_kwlists=( + [dev]=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.annot.kwlist.xml + [eval]=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.annot.kwlist3.xml +) # dev2h_kwlists +dev2h_nj=16 +dev2h_subset_ecf=true + + +#Official DEV corpora files location +dev10h_data_dir=/export/babel/data//404-georgian/IARPA-babel404b-v1.0a-build/BABEL_OP3_404/conversational/dev +dev10h_data_list=./conf/lists/404-georgian//dev.list +dev10h_rttm_file=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.mitllfa3.rttm +dev10h_ecf_file=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.scoring.ecf.xml +dev10h_stm_file=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.stm +dev10h_kwlists=( + [dev]=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.annot.kwlist.xml + [eval]=/export/babel/data/scoring/IndusDB/IARPA-babel404b-v1.0a_conv-dev/IARPA-babel404b-v1.0a_conv-dev.annot.kwlist3.xml +) # dev10h_kwlists +dev10h_nj=32 + + +#Unsupervised dataset for LimitedLP condition +unsup_data_list=( + ./conf/lists/404-georgian//untranscribed-training.list + ./conf/lists/404-georgian//sub-train.untranscribed.list +) # unsup_data_list +unsup_data_dir=( + /export/babel/data//404-georgian/IARPA-babel404b-v1.0a-build/BABEL_OP3_404/conversational/untranscribed-training + /export/babel/data//404-georgian/IARPA-babel404b-v1.0a-build/BABEL_OP3_404/conversational/training +) # unsup_data_dir +unsup_nj=32 + + +lexicon_file= +lexiconFlags="--romanized --oov " + + + diff --git a/egs/babel/s5d/conf/lists/404-georgian/dev.2h.list b/egs/babel/s5d/conf/lists/404-georgian/dev.2h.list new file mode 100644 index 00000000000..a823552044c --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/dev.2h.list @@ -0,0 +1,124 @@ +BABEL_OP3_404_10184_20141107_212406_inLine +BABEL_OP3_404_10184_20141107_212406_outLine +BABEL_OP3_404_12851_20141013_024620_inLine +BABEL_OP3_404_12851_20141013_024620_outLine +BABEL_OP3_404_16184_20141020_233508_inLine +BABEL_OP3_404_16184_20141020_233508_outLine +BABEL_OP3_404_17165_20141117_063008_inLine +BABEL_OP3_404_17165_20141117_063008_outLine +BABEL_OP3_404_17472_20141201_023731_inLine +BABEL_OP3_404_17472_20141201_023731_outLine +BABEL_OP3_404_18380_20141118_001754_inLine +BABEL_OP3_404_18380_20141118_001754_outLine +BABEL_OP3_404_18939_20141009_063127_inLine +BABEL_OP3_404_18939_20141009_063127_outLine +BABEL_OP3_404_22446_20141013_062554_inLine +BABEL_OP3_404_22446_20141013_062554_outLine 
+BABEL_OP3_404_22466_20141018_193129_inLine +BABEL_OP3_404_22466_20141018_193129_outLine +BABEL_OP3_404_22494_20141127_221208_inLine +BABEL_OP3_404_22494_20141127_221208_outLine +BABEL_OP3_404_22494_20141127_222057_inLine +BABEL_OP3_404_22494_20141127_222057_outLine +BABEL_OP3_404_23239_20141127_054155_inLine +BABEL_OP3_404_23239_20141127_054155_outLine +BABEL_OP3_404_24253_20150513_212152_inLine +BABEL_OP3_404_24253_20150513_212152_outLine +BABEL_OP3_404_24779_20150620_032949_inLine +BABEL_OP3_404_24779_20150620_032949_outLine +BABEL_OP3_404_26074_20141120_050650_inLine +BABEL_OP3_404_26074_20141120_050650_outLine +BABEL_OP3_404_28419_20141028_024104_inLine +BABEL_OP3_404_28419_20141028_024104_outLine +BABEL_OP3_404_33476_20141114_205102_inLine +BABEL_OP3_404_33476_20141114_205102_outLine +BABEL_OP3_404_34564_20141211_015413_inLine +BABEL_OP3_404_34564_20141211_015413_outLine +BABEL_OP3_404_35467_20141020_054030_inLine +BABEL_OP3_404_35467_20141020_054030_outLine +BABEL_OP3_404_38431_20141130_190122_inLine +BABEL_OP3_404_38431_20141130_190122_outLine +BABEL_OP3_404_41592_20141117_033328_inLine +BABEL_OP3_404_41592_20141117_033328_outLine +BABEL_OP3_404_41741_20141019_015552_inLine +BABEL_OP3_404_41741_20141019_015552_outLine +BABEL_OP3_404_42231_20141130_013425_inLine +BABEL_OP3_404_42231_20141130_013425_outLine +BABEL_OP3_404_42231_20141130_014628_inLine +BABEL_OP3_404_42231_20141130_014628_outLine +BABEL_OP3_404_42600_20141029_174857_inLine +BABEL_OP3_404_42600_20141029_174857_outLine +BABEL_OP3_404_44619_20141028_234639_inLine +BABEL_OP3_404_44619_20141028_234639_outLine +BABEL_OP3_404_46535_20150216_024618_inLine +BABEL_OP3_404_46535_20150216_024618_outLine +BABEL_OP3_404_46757_20141123_021510_inLine +BABEL_OP3_404_46757_20141123_021510_outLine +BABEL_OP3_404_47487_20141030_235808_inLine +BABEL_OP3_404_47487_20141030_235808_outLine +BABEL_OP3_404_47866_20150526_162411_inLine +BABEL_OP3_404_47866_20150526_162411_outLine +BABEL_OP3_404_47959_20141026_214447_inLine +BABEL_OP3_404_47959_20141026_214447_outLine +BABEL_OP3_404_51955_20141024_012212_inLine +BABEL_OP3_404_51955_20141024_012212_outLine +BABEL_OP3_404_51968_20141117_023015_inLine +BABEL_OP3_404_51968_20141117_023015_outLine +BABEL_OP3_404_52804_20141023_174815_inLine +BABEL_OP3_404_52804_20141023_174815_outLine +BABEL_OP3_404_54567_20141119_040337_inLine +BABEL_OP3_404_54567_20141119_040337_outLine +BABEL_OP3_404_56677_20141201_065523_inLine +BABEL_OP3_404_56677_20141201_065523_outLine +BABEL_OP3_404_56826_20141201_042429_inLine +BABEL_OP3_404_56826_20141201_042429_outLine +BABEL_OP3_404_58047_20141110_215330_inLine +BABEL_OP3_404_58047_20141110_215330_outLine +BABEL_OP3_404_58313_20141119_234202_inLine +BABEL_OP3_404_58313_20141119_234202_outLine +BABEL_OP3_404_59549_20141102_190355_inLine +BABEL_OP3_404_59549_20141102_190355_outLine +BABEL_OP3_404_60307_20150625_022621_inLine +BABEL_OP3_404_60307_20150625_022621_outLine +BABEL_OP3_404_61040_20141211_011552_inLine +BABEL_OP3_404_61040_20141211_011552_outLine +BABEL_OP3_404_61190_20141029_013447_inLine +BABEL_OP3_404_61190_20141029_013447_outLine +BABEL_OP3_404_64638_20141130_205157_inLine +BABEL_OP3_404_64638_20141130_205157_outLine +BABEL_OP3_404_66472_20141107_204602_inLine +BABEL_OP3_404_66472_20141107_204602_outLine +BABEL_OP3_404_66519_20141031_015751_inLine +BABEL_OP3_404_66519_20141031_015751_outLine +BABEL_OP3_404_67794_20141103_023323_inLine +BABEL_OP3_404_67794_20141103_023323_outLine +BABEL_OP3_404_73696_20150618_060036_inLine 
+BABEL_OP3_404_73696_20150618_060036_outLine +BABEL_OP3_404_73757_20141117_025704_inLine +BABEL_OP3_404_73757_20141117_025704_outLine +BABEL_OP3_404_74121_20141120_020705_inLine +BABEL_OP3_404_74121_20141120_020705_outLine +BABEL_OP3_404_80781_20141104_212234_inLine +BABEL_OP3_404_80781_20141104_212234_outLine +BABEL_OP3_404_80881_20141010_222135_inLine +BABEL_OP3_404_80881_20141010_222135_outLine +BABEL_OP3_404_81424_20141123_000421_inLine +BABEL_OP3_404_81424_20141123_000421_outLine +BABEL_OP3_404_87298_20141025_213601_inLine +BABEL_OP3_404_87298_20141025_213601_outLine +BABEL_OP3_404_87313_20141119_014632_inLine +BABEL_OP3_404_87313_20141119_014632_outLine +BABEL_OP3_404_87796_20141120_065537_inLine +BABEL_OP3_404_87796_20141120_065537_outLine +BABEL_OP3_404_87884_20141128_211555_inLine +BABEL_OP3_404_87884_20141128_211555_outLine +BABEL_OP3_404_88776_20141006_193621_inLine +BABEL_OP3_404_88776_20141006_193621_outLine +BABEL_OP3_404_91760_20150609_033824_inLine +BABEL_OP3_404_91760_20150609_033824_outLine +BABEL_OP3_404_91930_20150522_034521_inLine +BABEL_OP3_404_91930_20150522_034521_outLine +BABEL_OP3_404_92740_20141126_025242_inLine +BABEL_OP3_404_92740_20141126_025242_outLine +BABEL_OP3_404_97376_20141126_024552_inLine +BABEL_OP3_404_97376_20141126_024552_outLine diff --git a/egs/babel/s5d/conf/lists/404-georgian/dev.list b/egs/babel/s5d/conf/lists/404-georgian/dev.list new file mode 100644 index 00000000000..a823552044c --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/dev.list @@ -0,0 +1,124 @@ +BABEL_OP3_404_10184_20141107_212406_inLine +BABEL_OP3_404_10184_20141107_212406_outLine +BABEL_OP3_404_12851_20141013_024620_inLine +BABEL_OP3_404_12851_20141013_024620_outLine +BABEL_OP3_404_16184_20141020_233508_inLine +BABEL_OP3_404_16184_20141020_233508_outLine +BABEL_OP3_404_17165_20141117_063008_inLine +BABEL_OP3_404_17165_20141117_063008_outLine +BABEL_OP3_404_17472_20141201_023731_inLine +BABEL_OP3_404_17472_20141201_023731_outLine +BABEL_OP3_404_18380_20141118_001754_inLine +BABEL_OP3_404_18380_20141118_001754_outLine +BABEL_OP3_404_18939_20141009_063127_inLine +BABEL_OP3_404_18939_20141009_063127_outLine +BABEL_OP3_404_22446_20141013_062554_inLine +BABEL_OP3_404_22446_20141013_062554_outLine +BABEL_OP3_404_22466_20141018_193129_inLine +BABEL_OP3_404_22466_20141018_193129_outLine +BABEL_OP3_404_22494_20141127_221208_inLine +BABEL_OP3_404_22494_20141127_221208_outLine +BABEL_OP3_404_22494_20141127_222057_inLine +BABEL_OP3_404_22494_20141127_222057_outLine +BABEL_OP3_404_23239_20141127_054155_inLine +BABEL_OP3_404_23239_20141127_054155_outLine +BABEL_OP3_404_24253_20150513_212152_inLine +BABEL_OP3_404_24253_20150513_212152_outLine +BABEL_OP3_404_24779_20150620_032949_inLine +BABEL_OP3_404_24779_20150620_032949_outLine +BABEL_OP3_404_26074_20141120_050650_inLine +BABEL_OP3_404_26074_20141120_050650_outLine +BABEL_OP3_404_28419_20141028_024104_inLine +BABEL_OP3_404_28419_20141028_024104_outLine +BABEL_OP3_404_33476_20141114_205102_inLine +BABEL_OP3_404_33476_20141114_205102_outLine +BABEL_OP3_404_34564_20141211_015413_inLine +BABEL_OP3_404_34564_20141211_015413_outLine +BABEL_OP3_404_35467_20141020_054030_inLine +BABEL_OP3_404_35467_20141020_054030_outLine +BABEL_OP3_404_38431_20141130_190122_inLine +BABEL_OP3_404_38431_20141130_190122_outLine +BABEL_OP3_404_41592_20141117_033328_inLine +BABEL_OP3_404_41592_20141117_033328_outLine +BABEL_OP3_404_41741_20141019_015552_inLine +BABEL_OP3_404_41741_20141019_015552_outLine +BABEL_OP3_404_42231_20141130_013425_inLine 
+BABEL_OP3_404_42231_20141130_013425_outLine +BABEL_OP3_404_42231_20141130_014628_inLine +BABEL_OP3_404_42231_20141130_014628_outLine +BABEL_OP3_404_42600_20141029_174857_inLine +BABEL_OP3_404_42600_20141029_174857_outLine +BABEL_OP3_404_44619_20141028_234639_inLine +BABEL_OP3_404_44619_20141028_234639_outLine +BABEL_OP3_404_46535_20150216_024618_inLine +BABEL_OP3_404_46535_20150216_024618_outLine +BABEL_OP3_404_46757_20141123_021510_inLine +BABEL_OP3_404_46757_20141123_021510_outLine +BABEL_OP3_404_47487_20141030_235808_inLine +BABEL_OP3_404_47487_20141030_235808_outLine +BABEL_OP3_404_47866_20150526_162411_inLine +BABEL_OP3_404_47866_20150526_162411_outLine +BABEL_OP3_404_47959_20141026_214447_inLine +BABEL_OP3_404_47959_20141026_214447_outLine +BABEL_OP3_404_51955_20141024_012212_inLine +BABEL_OP3_404_51955_20141024_012212_outLine +BABEL_OP3_404_51968_20141117_023015_inLine +BABEL_OP3_404_51968_20141117_023015_outLine +BABEL_OP3_404_52804_20141023_174815_inLine +BABEL_OP3_404_52804_20141023_174815_outLine +BABEL_OP3_404_54567_20141119_040337_inLine +BABEL_OP3_404_54567_20141119_040337_outLine +BABEL_OP3_404_56677_20141201_065523_inLine +BABEL_OP3_404_56677_20141201_065523_outLine +BABEL_OP3_404_56826_20141201_042429_inLine +BABEL_OP3_404_56826_20141201_042429_outLine +BABEL_OP3_404_58047_20141110_215330_inLine +BABEL_OP3_404_58047_20141110_215330_outLine +BABEL_OP3_404_58313_20141119_234202_inLine +BABEL_OP3_404_58313_20141119_234202_outLine +BABEL_OP3_404_59549_20141102_190355_inLine +BABEL_OP3_404_59549_20141102_190355_outLine +BABEL_OP3_404_60307_20150625_022621_inLine +BABEL_OP3_404_60307_20150625_022621_outLine +BABEL_OP3_404_61040_20141211_011552_inLine +BABEL_OP3_404_61040_20141211_011552_outLine +BABEL_OP3_404_61190_20141029_013447_inLine +BABEL_OP3_404_61190_20141029_013447_outLine +BABEL_OP3_404_64638_20141130_205157_inLine +BABEL_OP3_404_64638_20141130_205157_outLine +BABEL_OP3_404_66472_20141107_204602_inLine +BABEL_OP3_404_66472_20141107_204602_outLine +BABEL_OP3_404_66519_20141031_015751_inLine +BABEL_OP3_404_66519_20141031_015751_outLine +BABEL_OP3_404_67794_20141103_023323_inLine +BABEL_OP3_404_67794_20141103_023323_outLine +BABEL_OP3_404_73696_20150618_060036_inLine +BABEL_OP3_404_73696_20150618_060036_outLine +BABEL_OP3_404_73757_20141117_025704_inLine +BABEL_OP3_404_73757_20141117_025704_outLine +BABEL_OP3_404_74121_20141120_020705_inLine +BABEL_OP3_404_74121_20141120_020705_outLine +BABEL_OP3_404_80781_20141104_212234_inLine +BABEL_OP3_404_80781_20141104_212234_outLine +BABEL_OP3_404_80881_20141010_222135_inLine +BABEL_OP3_404_80881_20141010_222135_outLine +BABEL_OP3_404_81424_20141123_000421_inLine +BABEL_OP3_404_81424_20141123_000421_outLine +BABEL_OP3_404_87298_20141025_213601_inLine +BABEL_OP3_404_87298_20141025_213601_outLine +BABEL_OP3_404_87313_20141119_014632_inLine +BABEL_OP3_404_87313_20141119_014632_outLine +BABEL_OP3_404_87796_20141120_065537_inLine +BABEL_OP3_404_87796_20141120_065537_outLine +BABEL_OP3_404_87884_20141128_211555_inLine +BABEL_OP3_404_87884_20141128_211555_outLine +BABEL_OP3_404_88776_20141006_193621_inLine +BABEL_OP3_404_88776_20141006_193621_outLine +BABEL_OP3_404_91760_20150609_033824_inLine +BABEL_OP3_404_91760_20150609_033824_outLine +BABEL_OP3_404_91930_20150522_034521_inLine +BABEL_OP3_404_91930_20150522_034521_outLine +BABEL_OP3_404_92740_20141126_025242_inLine +BABEL_OP3_404_92740_20141126_025242_outLine +BABEL_OP3_404_97376_20141126_024552_inLine +BABEL_OP3_404_97376_20141126_024552_outLine diff --git 
a/egs/babel/s5d/conf/lists/404-georgian/eval.list b/egs/babel/s5d/conf/lists/404-georgian/eval.list new file mode 100644 index 00000000000..d197b90ee2f --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/eval.list @@ -0,0 +1,956 @@ +BABEL_OP3_404_10036_20141030_200515_inLine +BABEL_OP3_404_10036_20141030_200515_outLine +BABEL_OP3_404_10188_20141021_043537_inLine +BABEL_OP3_404_10188_20141021_043537_outLine +BABEL_OP3_404_10319_20141015_010220_inLine +BABEL_OP3_404_10319_20141015_010220_outLine +BABEL_OP3_404_10319_20141015_011118_inLine +BABEL_OP3_404_10319_20141015_011118_outLine +BABEL_OP3_404_10482_20141130_013900_inLine +BABEL_OP3_404_10482_20141130_013900_outLine +BABEL_OP3_404_10524_20150518_002415_inLine +BABEL_OP3_404_10524_20150518_002415_outLine +BABEL_OP3_404_10901_20141120_172058_inLine +BABEL_OP3_404_10901_20141120_172058_outLine +BABEL_OP3_404_10966_20141027_000701_inLine +BABEL_OP3_404_10966_20141027_000701_outLine +BABEL_OP3_404_11419_20150212_050835_inLine +BABEL_OP3_404_11419_20150212_050835_outLine +BABEL_OP3_404_11419_20150212_051550_inLine +BABEL_OP3_404_11419_20150212_051550_outLine +BABEL_OP3_404_11581_20141110_223927_inLine +BABEL_OP3_404_11581_20141110_223927_outLine +BABEL_OP3_404_11797_20141019_195244_inLine +BABEL_OP3_404_11797_20141019_195244_outLine +BABEL_OP3_404_12321_20141211_055837_inLine +BABEL_OP3_404_12321_20141211_055837_outLine +BABEL_OP3_404_13040_20141024_004921_inLine +BABEL_OP3_404_13040_20141024_004921_outLine +BABEL_OP3_404_13427_20141107_220103_inLine +BABEL_OP3_404_13427_20141107_220103_outLine +BABEL_OP3_404_13483_20141128_002800_inLine +BABEL_OP3_404_13483_20141128_002800_outLine +BABEL_OP3_404_13490_20141118_023408_inLine +BABEL_OP3_404_13490_20141118_023408_outLine +BABEL_OP3_404_13561_20141115_003843_inLine +BABEL_OP3_404_13561_20141115_003843_outLine +BABEL_OP3_404_13586_20141106_180057_inLine +BABEL_OP3_404_13586_20141106_180057_outLine +BABEL_OP3_404_13744_20141021_043037_inLine +BABEL_OP3_404_13744_20141021_043037_outLine +BABEL_OP3_404_13792_20141011_010111_inLine +BABEL_OP3_404_13792_20141011_010111_outLine +BABEL_OP3_404_14097_20150211_010746_inLine +BABEL_OP3_404_14097_20150211_010746_outLine +BABEL_OP3_404_14179_20141201_063636_inLine +BABEL_OP3_404_14179_20141201_063636_outLine +BABEL_OP3_404_14228_20141130_062059_inLine +BABEL_OP3_404_14228_20141130_062059_outLine +BABEL_OP3_404_14560_20141201_073709_inLine +BABEL_OP3_404_14560_20141201_073709_outLine +BABEL_OP3_404_14719_20141201_014614_inLine +BABEL_OP3_404_14719_20141201_014614_outLine +BABEL_OP3_404_14725_20141013_005356_inLine +BABEL_OP3_404_14725_20141013_005356_outLine +BABEL_OP3_404_15163_20141115_035641_inLine +BABEL_OP3_404_15163_20141115_035641_outLine +BABEL_OP3_404_15322_20150512_231817_inLine +BABEL_OP3_404_15322_20150512_231817_outLine +BABEL_OP3_404_15324_20141120_031528_inLine +BABEL_OP3_404_15324_20141120_031528_outLine +BABEL_OP3_404_15702_20141129_051812_inLine +BABEL_OP3_404_15702_20141129_051812_outLine +BABEL_OP3_404_15730_20141021_055606_inLine +BABEL_OP3_404_15730_20141021_055606_outLine +BABEL_OP3_404_15926_20141124_004339_inLine +BABEL_OP3_404_15926_20141124_004339_outLine +BABEL_OP3_404_15926_20141124_005513_inLine +BABEL_OP3_404_15926_20141124_005513_outLine +BABEL_OP3_404_16056_20141009_005123_inLine +BABEL_OP3_404_16056_20141009_005123_outLine +BABEL_OP3_404_16787_20141120_174312_inLine +BABEL_OP3_404_16787_20141120_174312_outLine +BABEL_OP3_404_16800_20141212_184132_inLine +BABEL_OP3_404_16800_20141212_184132_outLine 
+BABEL_OP3_404_16800_20141212_185849_inLine +BABEL_OP3_404_16800_20141212_185849_outLine +BABEL_OP3_404_16886_20141117_002313_inLine +BABEL_OP3_404_16886_20141117_002313_outLine +BABEL_OP3_404_16886_20141117_003801_inLine +BABEL_OP3_404_16886_20141117_003801_outLine +BABEL_OP3_404_16924_20141201_020122_inLine +BABEL_OP3_404_16924_20141201_020122_outLine +BABEL_OP3_404_16938_20141118_045730_inLine +BABEL_OP3_404_16938_20141118_045730_outLine +BABEL_OP3_404_17032_20141128_030249_inLine +BABEL_OP3_404_17032_20141128_030249_outLine +BABEL_OP3_404_17440_20141127_041844_inLine +BABEL_OP3_404_17440_20141127_041844_outLine +BABEL_OP3_404_17496_20141130_022805_inLine +BABEL_OP3_404_17496_20141130_022805_outLine +BABEL_OP3_404_17751_20150611_030539_inLine +BABEL_OP3_404_17751_20150611_030539_outLine +BABEL_OP3_404_17881_20150524_231317_inLine +BABEL_OP3_404_17881_20150524_231317_outLine +BABEL_OP3_404_17914_20150526_054931_inLine +BABEL_OP3_404_17914_20150526_054931_outLine +BABEL_OP3_404_18280_20150213_011322_inLine +BABEL_OP3_404_18280_20150213_011322_outLine +BABEL_OP3_404_18370_20150210_194727_inLine +BABEL_OP3_404_18370_20150210_194727_outLine +BABEL_OP3_404_18924_20141110_211055_inLine +BABEL_OP3_404_18924_20141110_211055_outLine +BABEL_OP3_404_19101_20141113_042102_inLine +BABEL_OP3_404_19101_20141113_042102_outLine +BABEL_OP3_404_19545_20141107_223152_inLine +BABEL_OP3_404_19545_20141107_223152_outLine +BABEL_OP3_404_19621_20141201_041129_inLine +BABEL_OP3_404_19621_20141201_041129_outLine +BABEL_OP3_404_19672_20141124_015046_inLine +BABEL_OP3_404_19672_20141124_015046_outLine +BABEL_OP3_404_19722_20141006_033717_inLine +BABEL_OP3_404_19722_20141006_033717_outLine +BABEL_OP3_404_19782_20141201_231608_inLine +BABEL_OP3_404_19782_20141201_231608_outLine +BABEL_OP3_404_19818_20141124_044516_inLine +BABEL_OP3_404_19818_20141124_044516_outLine +BABEL_OP3_404_20367_20150618_055644_inLine +BABEL_OP3_404_20367_20150618_055644_outLine +BABEL_OP3_404_20682_20141211_044056_inLine +BABEL_OP3_404_20682_20141211_044056_outLine +BABEL_OP3_404_20682_20141211_045257_inLine +BABEL_OP3_404_20682_20141211_045257_outLine +BABEL_OP3_404_20738_20150503_191409_inLine +BABEL_OP3_404_20738_20150503_191409_outLine +BABEL_OP3_404_20768_20141207_081305_inLine +BABEL_OP3_404_20768_20141207_081305_outLine +BABEL_OP3_404_20800_20141022_192312_inLine +BABEL_OP3_404_20800_20141022_192312_outLine +BABEL_OP3_404_20916_20141006_192451_inLine +BABEL_OP3_404_20916_20141006_192451_outLine +BABEL_OP3_404_21029_20141105_033902_inLine +BABEL_OP3_404_21029_20141105_033902_outLine +BABEL_OP3_404_21206_20141024_194128_inLine +BABEL_OP3_404_21206_20141024_194128_outLine +BABEL_OP3_404_21624_20150525_034841_inLine +BABEL_OP3_404_21624_20150525_034841_outLine +BABEL_OP3_404_21794_20141115_220258_inLine +BABEL_OP3_404_21794_20141115_220258_outLine +BABEL_OP3_404_22021_20150217_213437_inLine +BABEL_OP3_404_22021_20150217_213437_outLine +BABEL_OP3_404_22021_20150220_194248_inLine +BABEL_OP3_404_22021_20150220_194248_outLine +BABEL_OP3_404_22034_20150211_165126_inLine +BABEL_OP3_404_22034_20150211_165126_outLine +BABEL_OP3_404_22170_20150528_002541_inLine +BABEL_OP3_404_22170_20150528_002541_outLine +BABEL_OP3_404_22216_20141020_051333_inLine +BABEL_OP3_404_22216_20141020_051333_outLine +BABEL_OP3_404_22321_20141019_214812_inLine +BABEL_OP3_404_22321_20141019_214812_outLine +BABEL_OP3_404_22612_20141201_080517_inLine +BABEL_OP3_404_22612_20141201_080517_outLine +BABEL_OP3_404_22641_20141021_165119_inLine 
+BABEL_OP3_404_22641_20141021_165119_outLine +BABEL_OP3_404_22965_20141101_192617_inLine +BABEL_OP3_404_22965_20141101_192617_outLine +BABEL_OP3_404_23006_20141026_211155_inLine +BABEL_OP3_404_23006_20141026_211155_outLine +BABEL_OP3_404_23092_20141129_005335_inLine +BABEL_OP3_404_23092_20141129_005335_outLine +BABEL_OP3_404_23153_20141118_015224_inLine +BABEL_OP3_404_23153_20141118_015224_outLine +BABEL_OP3_404_23628_20141027_170345_inLine +BABEL_OP3_404_23628_20141027_170345_outLine +BABEL_OP3_404_24017_20141211_021947_inLine +BABEL_OP3_404_24017_20141211_021947_outLine +BABEL_OP3_404_24290_20150515_164252_inLine +BABEL_OP3_404_24290_20150515_164252_outLine +BABEL_OP3_404_24569_20141130_214924_inLine +BABEL_OP3_404_24569_20141130_214924_outLine +BABEL_OP3_404_24605_20141013_043620_inLine +BABEL_OP3_404_24605_20141013_043620_outLine +BABEL_OP3_404_25698_20150611_021501_inLine +BABEL_OP3_404_25698_20150611_021501_outLine +BABEL_OP3_404_25767_20141009_211814_inLine +BABEL_OP3_404_25767_20141009_211814_outLine +BABEL_OP3_404_26206_20141128_031139_inLine +BABEL_OP3_404_26206_20141128_031139_outLine +BABEL_OP3_404_26999_20141130_004320_inLine +BABEL_OP3_404_26999_20141130_004320_outLine +BABEL_OP3_404_27082_20141119_041436_inLine +BABEL_OP3_404_27082_20141119_041436_outLine +BABEL_OP3_404_27125_20141007_032335_inLine +BABEL_OP3_404_27125_20141007_032335_outLine +BABEL_OP3_404_27478_20150514_205232_inLine +BABEL_OP3_404_27478_20150514_205232_outLine +BABEL_OP3_404_28422_20141124_055809_inLine +BABEL_OP3_404_28422_20141124_055809_outLine +BABEL_OP3_404_28606_20141127_011719_inLine +BABEL_OP3_404_28606_20141127_011719_outLine +BABEL_OP3_404_28775_20141028_193907_inLine +BABEL_OP3_404_28775_20141028_193907_outLine +BABEL_OP3_404_29023_20141024_225827_inLine +BABEL_OP3_404_29023_20141024_225827_outLine +BABEL_OP3_404_29072_20141128_023212_inLine +BABEL_OP3_404_29072_20141128_023212_outLine +BABEL_OP3_404_29135_20141022_182050_inLine +BABEL_OP3_404_29135_20141022_182050_outLine +BABEL_OP3_404_29168_20141023_013832_inLine +BABEL_OP3_404_29168_20141023_013832_outLine +BABEL_OP3_404_29352_20150618_035033_inLine +BABEL_OP3_404_29352_20150618_035033_outLine +BABEL_OP3_404_29352_20150618_041025_inLine +BABEL_OP3_404_29352_20150618_041025_outLine +BABEL_OP3_404_29685_20141103_223309_inLine +BABEL_OP3_404_29685_20141103_223309_outLine +BABEL_OP3_404_29765_20150616_155830_inLine +BABEL_OP3_404_29765_20150616_155830_outLine +BABEL_OP3_404_30013_20141127_211853_inLine +BABEL_OP3_404_30013_20141127_211853_outLine +BABEL_OP3_404_30058_20150514_024957_inLine +BABEL_OP3_404_30058_20150514_024957_outLine +BABEL_OP3_404_30180_20141118_011806_inLine +BABEL_OP3_404_30180_20141118_011806_outLine +BABEL_OP3_404_30253_20141201_051926_inLine +BABEL_OP3_404_30253_20141201_051926_outLine +BABEL_OP3_404_30395_20141106_185545_inLine +BABEL_OP3_404_30395_20141106_185545_outLine +BABEL_OP3_404_31039_20150217_050120_inLine +BABEL_OP3_404_31039_20150217_050120_outLine +BABEL_OP3_404_31039_20150217_051317_inLine +BABEL_OP3_404_31039_20150217_051317_outLine +BABEL_OP3_404_31074_20150121_022649_inLine +BABEL_OP3_404_31074_20150121_022649_outLine +BABEL_OP3_404_31184_20141118_183536_inLine +BABEL_OP3_404_31184_20141118_183536_outLine +BABEL_OP3_404_31490_20141022_200135_inLine +BABEL_OP3_404_31490_20141022_200135_outLine +BABEL_OP3_404_31583_20141130_004731_inLine +BABEL_OP3_404_31583_20141130_004731_outLine +BABEL_OP3_404_31628_20141202_000346_inLine +BABEL_OP3_404_31628_20141202_000346_outLine 
+BABEL_OP3_404_32097_20141006_221638_inLine +BABEL_OP3_404_32097_20141006_221638_outLine +BABEL_OP3_404_32244_20150609_043200_inLine +BABEL_OP3_404_32244_20150609_043200_outLine +BABEL_OP3_404_32301_20141126_204138_inLine +BABEL_OP3_404_32301_20141126_204138_outLine +BABEL_OP3_404_33111_20150528_004829_inLine +BABEL_OP3_404_33111_20150528_004829_outLine +BABEL_OP3_404_33251_20141119_205146_inLine +BABEL_OP3_404_33251_20141119_205146_outLine +BABEL_OP3_404_33273_20141105_213401_inLine +BABEL_OP3_404_33273_20141105_213401_outLine +BABEL_OP3_404_33497_20141119_051436_inLine +BABEL_OP3_404_33497_20141119_051436_outLine +BABEL_OP3_404_33635_20141106_005750_inLine +BABEL_OP3_404_33635_20141106_005750_outLine +BABEL_OP3_404_33672_20141014_004055_inLine +BABEL_OP3_404_33672_20141014_004055_outLine +BABEL_OP3_404_33672_20141014_005233_inLine +BABEL_OP3_404_33672_20141014_005233_outLine +BABEL_OP3_404_33951_20141119_072531_inLine +BABEL_OP3_404_33951_20141119_072531_outLine +BABEL_OP3_404_34197_20141018_201528_inLine +BABEL_OP3_404_34197_20141018_201528_outLine +BABEL_OP3_404_34336_20141027_211535_inLine +BABEL_OP3_404_34336_20141027_211535_outLine +BABEL_OP3_404_34477_20141027_184645_inLine +BABEL_OP3_404_34477_20141027_184645_outLine +BABEL_OP3_404_34903_20141124_020719_inLine +BABEL_OP3_404_34903_20141124_020719_outLine +BABEL_OP3_404_35139_20141023_224322_inLine +BABEL_OP3_404_35139_20141023_224322_outLine +BABEL_OP3_404_35202_20141128_053756_inLine +BABEL_OP3_404_35202_20141128_053756_outLine +BABEL_OP3_404_35885_20150518_015426_inLine +BABEL_OP3_404_35885_20150518_015426_outLine +BABEL_OP3_404_36293_20141006_004659_inLine +BABEL_OP3_404_36293_20141006_004659_outLine +BABEL_OP3_404_36341_20141021_045218_inLine +BABEL_OP3_404_36341_20141021_045218_outLine +BABEL_OP3_404_36669_20141116_050542_inLine +BABEL_OP3_404_36669_20141116_050542_outLine +BABEL_OP3_404_36894_20141009_013557_inLine +BABEL_OP3_404_36894_20141009_013557_outLine +BABEL_OP3_404_36990_20141117_041052_inLine +BABEL_OP3_404_36990_20141117_041052_outLine +BABEL_OP3_404_37068_20150212_050250_inLine +BABEL_OP3_404_37068_20150212_050250_outLine +BABEL_OP3_404_37285_20141128_060822_inLine +BABEL_OP3_404_37285_20141128_060822_outLine +BABEL_OP3_404_37684_20150211_031551_inLine +BABEL_OP3_404_37684_20150211_031551_outLine +BABEL_OP3_404_38076_20141129_030136_inLine +BABEL_OP3_404_38076_20141129_030136_outLine +BABEL_OP3_404_38689_20141128_235841_inLine +BABEL_OP3_404_38689_20141128_235841_outLine +BABEL_OP3_404_38741_20141028_190310_inLine +BABEL_OP3_404_38741_20141028_190310_outLine +BABEL_OP3_404_38750_20141130_052516_inLine +BABEL_OP3_404_38750_20141130_052516_outLine +BABEL_OP3_404_38878_20141118_224023_inLine +BABEL_OP3_404_38878_20141118_224023_outLine +BABEL_OP3_404_39006_20150617_032943_inLine +BABEL_OP3_404_39006_20150617_032943_outLine +BABEL_OP3_404_39159_20141021_033733_inLine +BABEL_OP3_404_39159_20141021_033733_outLine +BABEL_OP3_404_39848_20141113_234103_inLine +BABEL_OP3_404_39848_20141113_234103_outLine +BABEL_OP3_404_40565_20141126_191549_inLine +BABEL_OP3_404_40565_20141126_191549_outLine +BABEL_OP3_404_41038_20141201_070557_inLine +BABEL_OP3_404_41038_20141201_070557_outLine +BABEL_OP3_404_41174_20141117_033354_inLine +BABEL_OP3_404_41174_20141117_033354_outLine +BABEL_OP3_404_41442_20141201_065524_inLine +BABEL_OP3_404_41442_20141201_065524_outLine +BABEL_OP3_404_41469_20141015_041032_inLine +BABEL_OP3_404_41469_20141015_041032_outLine +BABEL_OP3_404_41493_20141007_192601_inLine 
+BABEL_OP3_404_41493_20141007_192601_outLine +BABEL_OP3_404_41618_20141114_232533_inLine +BABEL_OP3_404_41618_20141114_232533_outLine +BABEL_OP3_404_41890_20150516_214915_inLine +BABEL_OP3_404_41890_20150516_214915_outLine +BABEL_OP3_404_42146_20150524_225524_inLine +BABEL_OP3_404_42146_20150524_225524_outLine +BABEL_OP3_404_42434_20141101_015900_inLine +BABEL_OP3_404_42434_20141101_015900_outLine +BABEL_OP3_404_42718_20150514_042601_inLine +BABEL_OP3_404_42718_20150514_042601_outLine +BABEL_OP3_404_42771_20141119_032738_inLine +BABEL_OP3_404_42771_20141119_032738_outLine +BABEL_OP3_404_42942_20141105_231330_inLine +BABEL_OP3_404_42942_20141105_231330_outLine +BABEL_OP3_404_42991_20141201_174138_inLine +BABEL_OP3_404_42991_20141201_174138_outLine +BABEL_OP3_404_43115_20150518_051249_inLine +BABEL_OP3_404_43115_20150518_051249_outLine +BABEL_OP3_404_43285_20141127_224948_inLine +BABEL_OP3_404_43285_20141127_224948_outLine +BABEL_OP3_404_43286_20141011_233252_inLine +BABEL_OP3_404_43286_20141011_233252_outLine +BABEL_OP3_404_43646_20141011_031534_inLine +BABEL_OP3_404_43646_20141011_031534_outLine +BABEL_OP3_404_43784_20141101_215816_inLine +BABEL_OP3_404_43784_20141101_215816_outLine +BABEL_OP3_404_43784_20141101_220445_inLine +BABEL_OP3_404_43784_20141101_220445_outLine +BABEL_OP3_404_43784_20141101_222312_inLine +BABEL_OP3_404_43784_20141101_222312_outLine +BABEL_OP3_404_43788_20141125_190621_inLine +BABEL_OP3_404_43788_20141125_190621_outLine +BABEL_OP3_404_43920_20141128_232903_inLine +BABEL_OP3_404_43920_20141128_232903_outLine +BABEL_OP3_404_44255_20150525_073716_inLine +BABEL_OP3_404_44255_20150525_073716_outLine +BABEL_OP3_404_44420_20141025_211032_inLine +BABEL_OP3_404_44420_20141025_211032_outLine +BABEL_OP3_404_44531_20150527_015805_inLine +BABEL_OP3_404_44531_20150527_015805_outLine +BABEL_OP3_404_44709_20141126_024811_inLine +BABEL_OP3_404_44709_20141126_024811_outLine +BABEL_OP3_404_44868_20141123_032254_inLine +BABEL_OP3_404_44868_20141123_032254_outLine +BABEL_OP3_404_45642_20141011_233950_inLine +BABEL_OP3_404_45642_20141011_233950_outLine +BABEL_OP3_404_45770_20141009_185730_inLine +BABEL_OP3_404_45770_20141009_185730_outLine +BABEL_OP3_404_45777_20141028_195713_inLine +BABEL_OP3_404_45777_20141028_195713_outLine +BABEL_OP3_404_45843_20141124_042608_inLine +BABEL_OP3_404_45843_20141124_042608_outLine +BABEL_OP3_404_46008_20150525_024936_inLine +BABEL_OP3_404_46008_20150525_024936_outLine +BABEL_OP3_404_46261_20141117_200301_inLine +BABEL_OP3_404_46261_20141117_200301_outLine +BABEL_OP3_404_46389_20150216_043700_inLine +BABEL_OP3_404_46389_20150216_043700_outLine +BABEL_OP3_404_46558_20141020_013256_inLine +BABEL_OP3_404_46558_20141020_013256_outLine +BABEL_OP3_404_46589_20141126_010932_inLine +BABEL_OP3_404_46589_20141126_010932_outLine +BABEL_OP3_404_46702_20141021_004925_inLine +BABEL_OP3_404_46702_20141021_004925_outLine +BABEL_OP3_404_47110_20150211_041423_inLine +BABEL_OP3_404_47110_20150211_041423_outLine +BABEL_OP3_404_47186_20141130_032126_inLine +BABEL_OP3_404_47186_20141130_032126_outLine +BABEL_OP3_404_47215_20141016_012848_inLine +BABEL_OP3_404_47215_20141016_012848_outLine +BABEL_OP3_404_47283_20141105_063730_inLine +BABEL_OP3_404_47283_20141105_063730_outLine +BABEL_OP3_404_47451_20141201_044107_inLine +BABEL_OP3_404_47451_20141201_044107_outLine +BABEL_OP3_404_47451_20141201_045923_inLine +BABEL_OP3_404_47451_20141201_045923_outLine +BABEL_OP3_404_47878_20141115_030044_inLine +BABEL_OP3_404_47878_20141115_030044_outLine 
+BABEL_OP3_404_48789_20141130_013950_inLine +BABEL_OP3_404_48789_20141130_013950_outLine +BABEL_OP3_404_49001_20141102_054949_inLine +BABEL_OP3_404_49001_20141102_054949_outLine +BABEL_OP3_404_49216_20141023_021720_inLine +BABEL_OP3_404_49216_20141023_021720_outLine +BABEL_OP3_404_49287_20141201_003931_inLine +BABEL_OP3_404_49287_20141201_003931_outLine +BABEL_OP3_404_49502_20141012_055001_inLine +BABEL_OP3_404_49502_20141012_055001_outLine +BABEL_OP3_404_49637_20141006_052951_inLine +BABEL_OP3_404_49637_20141006_052951_outLine +BABEL_OP3_404_50090_20141119_215921_inLine +BABEL_OP3_404_50090_20141119_215921_outLine +BABEL_OP3_404_50427_20141108_184045_inLine +BABEL_OP3_404_50427_20141108_184045_outLine +BABEL_OP3_404_50630_20141123_224108_inLine +BABEL_OP3_404_50630_20141123_224108_outLine +BABEL_OP3_404_50681_20141119_074034_inLine +BABEL_OP3_404_50681_20141119_074034_outLine +BABEL_OP3_404_50726_20141021_005526_inLine +BABEL_OP3_404_50726_20141021_005526_outLine +BABEL_OP3_404_50958_20141118_184358_inLine +BABEL_OP3_404_50958_20141118_184358_outLine +BABEL_OP3_404_50958_20141118_185604_inLine +BABEL_OP3_404_50958_20141118_185604_outLine +BABEL_OP3_404_50962_20141107_060744_inLine +BABEL_OP3_404_50962_20141107_060744_outLine +BABEL_OP3_404_51407_20141117_062029_inLine +BABEL_OP3_404_51407_20141117_062029_outLine +BABEL_OP3_404_51611_20141022_024919_inLine +BABEL_OP3_404_51611_20141022_024919_outLine +BABEL_OP3_404_51819_20141126_211917_inLine +BABEL_OP3_404_51819_20141126_211917_outLine +BABEL_OP3_404_52272_20141006_031940_inLine +BABEL_OP3_404_52272_20141006_031940_outLine +BABEL_OP3_404_52438_20141104_034612_inLine +BABEL_OP3_404_52438_20141104_034612_outLine +BABEL_OP3_404_52442_20141109_004908_inLine +BABEL_OP3_404_52442_20141109_004908_outLine +BABEL_OP3_404_52614_20150503_200805_inLine +BABEL_OP3_404_52614_20150503_200805_outLine +BABEL_OP3_404_52694_20141121_043410_inLine +BABEL_OP3_404_52694_20141121_043410_outLine +BABEL_OP3_404_52717_20141014_234034_inLine +BABEL_OP3_404_52717_20141014_234034_outLine +BABEL_OP3_404_52818_20141130_231525_inLine +BABEL_OP3_404_52818_20141130_231525_outLine +BABEL_OP3_404_52932_20141101_234724_inLine +BABEL_OP3_404_52932_20141101_234724_outLine +BABEL_OP3_404_53419_20141201_030819_inLine +BABEL_OP3_404_53419_20141201_030819_outLine +BABEL_OP3_404_53842_20141119_044935_inLine +BABEL_OP3_404_53842_20141119_044935_outLine +BABEL_OP3_404_54074_20141129_060147_inLine +BABEL_OP3_404_54074_20141129_060147_outLine +BABEL_OP3_404_54162_20141119_032442_inLine +BABEL_OP3_404_54162_20141119_032442_outLine +BABEL_OP3_404_54390_20141028_230702_inLine +BABEL_OP3_404_54390_20141028_230702_outLine +BABEL_OP3_404_54530_20141130_011651_inLine +BABEL_OP3_404_54530_20141130_011651_outLine +BABEL_OP3_404_54697_20141201_053854_inLine +BABEL_OP3_404_54697_20141201_053854_outLine +BABEL_OP3_404_54953_20141115_022411_inLine +BABEL_OP3_404_54953_20141115_022411_outLine +BABEL_OP3_404_55742_20141102_071943_inLine +BABEL_OP3_404_55742_20141102_071943_outLine +BABEL_OP3_404_55818_20141014_062259_inLine +BABEL_OP3_404_55818_20141014_062259_outLine +BABEL_OP3_404_55950_20150502_234657_inLine +BABEL_OP3_404_55950_20150502_234657_outLine +BABEL_OP3_404_55968_20141009_231223_inLine +BABEL_OP3_404_55968_20141009_231223_outLine +BABEL_OP3_404_56090_20141019_172050_inLine +BABEL_OP3_404_56090_20141019_172050_outLine +BABEL_OP3_404_56198_20141103_031752_inLine +BABEL_OP3_404_56198_20141103_031752_outLine +BABEL_OP3_404_56307_20141201_210608_inLine 
+BABEL_OP3_404_56307_20141201_210608_outLine +BABEL_OP3_404_56370_20141010_013542_inLine +BABEL_OP3_404_56370_20141010_013542_outLine +BABEL_OP3_404_56429_20141024_003551_inLine +BABEL_OP3_404_56429_20141024_003551_outLine +BABEL_OP3_404_56523_20141114_215534_inLine +BABEL_OP3_404_56523_20141114_215534_outLine +BABEL_OP3_404_56720_20141129_182808_inLine +BABEL_OP3_404_56720_20141129_182808_outLine +BABEL_OP3_404_56720_20141129_183649_inLine +BABEL_OP3_404_56720_20141129_183649_outLine +BABEL_OP3_404_57093_20141118_034107_inLine +BABEL_OP3_404_57093_20141118_034107_outLine +BABEL_OP3_404_57116_20141008_023139_inLine +BABEL_OP3_404_57116_20141008_023139_outLine +BABEL_OP3_404_57529_20141201_050129_inLine +BABEL_OP3_404_57529_20141201_050129_outLine +BABEL_OP3_404_57548_20141119_194430_inLine +BABEL_OP3_404_57548_20141119_194430_outLine +BABEL_OP3_404_57609_20141117_063904_inLine +BABEL_OP3_404_57609_20141117_063904_outLine +BABEL_OP3_404_57609_20141119_223552_inLine +BABEL_OP3_404_57609_20141119_223552_outLine +BABEL_OP3_404_57922_20141119_172249_inLine +BABEL_OP3_404_57922_20141119_172249_outLine +BABEL_OP3_404_57935_20141122_233816_inLine +BABEL_OP3_404_57935_20141122_233816_outLine +BABEL_OP3_404_58107_20141107_223929_inLine +BABEL_OP3_404_58107_20141107_223929_outLine +BABEL_OP3_404_58145_20141120_014653_inLine +BABEL_OP3_404_58145_20141120_014653_outLine +BABEL_OP3_404_58489_20141201_035927_inLine +BABEL_OP3_404_58489_20141201_035927_outLine +BABEL_OP3_404_58717_20141106_221300_inLine +BABEL_OP3_404_58717_20141106_221300_outLine +BABEL_OP3_404_58734_20141019_223233_inLine +BABEL_OP3_404_58734_20141019_223233_outLine +BABEL_OP3_404_58815_20141129_230108_inLine +BABEL_OP3_404_58815_20141129_230108_outLine +BABEL_OP3_404_58821_20141128_224222_inLine +BABEL_OP3_404_58821_20141128_224222_outLine +BABEL_OP3_404_58850_20141116_234915_inLine +BABEL_OP3_404_58850_20141116_234915_outLine +BABEL_OP3_404_58926_20141105_025457_inLine +BABEL_OP3_404_58926_20141105_025457_outLine +BABEL_OP3_404_59163_20150212_233430_inLine +BABEL_OP3_404_59163_20150212_233430_outLine +BABEL_OP3_404_59291_20141129_223855_inLine +BABEL_OP3_404_59291_20141129_223855_outLine +BABEL_OP3_404_59509_20141120_010036_inLine +BABEL_OP3_404_59509_20141120_010036_outLine +BABEL_OP3_404_59747_20141020_002625_inLine +BABEL_OP3_404_59747_20141020_002625_outLine +BABEL_OP3_404_59928_20141107_063850_inLine +BABEL_OP3_404_59928_20141107_063850_outLine +BABEL_OP3_404_59993_20141102_204023_inLine +BABEL_OP3_404_59993_20141102_204023_outLine +BABEL_OP3_404_60115_20141123_045055_inLine +BABEL_OP3_404_60115_20141123_045055_outLine +BABEL_OP3_404_60418_20141201_012853_inLine +BABEL_OP3_404_60418_20141201_012853_outLine +BABEL_OP3_404_60538_20141010_000421_inLine +BABEL_OP3_404_60538_20141010_000421_outLine +BABEL_OP3_404_60661_20141023_185331_inLine +BABEL_OP3_404_60661_20141023_185331_outLine +BABEL_OP3_404_60830_20141119_050849_inLine +BABEL_OP3_404_60830_20141119_050849_outLine +BABEL_OP3_404_60836_20141026_014449_inLine +BABEL_OP3_404_60836_20141026_014449_outLine +BABEL_OP3_404_61011_20141022_235244_inLine +BABEL_OP3_404_61011_20141022_235244_outLine +BABEL_OP3_404_61357_20141118_052326_inLine +BABEL_OP3_404_61357_20141118_052326_outLine +BABEL_OP3_404_61731_20141026_185743_inLine +BABEL_OP3_404_61731_20141026_185743_outLine +BABEL_OP3_404_62014_20141120_021455_inLine +BABEL_OP3_404_62014_20141120_021455_outLine +BABEL_OP3_404_62177_20150503_025324_inLine +BABEL_OP3_404_62177_20150503_025324_outLine 
+BABEL_OP3_404_62200_20141115_024033_inLine +BABEL_OP3_404_62200_20141115_024033_outLine +BABEL_OP3_404_62289_20150526_045908_inLine +BABEL_OP3_404_62289_20150526_045908_outLine +BABEL_OP3_404_62430_20150526_181036_inLine +BABEL_OP3_404_62430_20150526_181036_outLine +BABEL_OP3_404_62434_20141019_201121_inLine +BABEL_OP3_404_62434_20141019_201121_outLine +BABEL_OP3_404_62656_20150119_185511_inLine +BABEL_OP3_404_62656_20150119_185511_outLine +BABEL_OP3_404_62800_20141020_020318_inLine +BABEL_OP3_404_62800_20141020_020318_outLine +BABEL_OP3_404_62835_20141119_043323_inLine +BABEL_OP3_404_62835_20141119_043323_outLine +BABEL_OP3_404_62976_20141119_061748_inLine +BABEL_OP3_404_62976_20141119_061748_outLine +BABEL_OP3_404_63307_20141119_192444_inLine +BABEL_OP3_404_63307_20141119_192444_outLine +BABEL_OP3_404_63445_20141021_013007_inLine +BABEL_OP3_404_63445_20141021_013007_outLine +BABEL_OP3_404_63523_20150512_050203_inLine +BABEL_OP3_404_63523_20150512_050203_outLine +BABEL_OP3_404_63604_20141011_021042_inLine +BABEL_OP3_404_63604_20141011_021042_outLine +BABEL_OP3_404_63787_20141010_225937_inLine +BABEL_OP3_404_63787_20141010_225937_outLine +BABEL_OP3_404_63938_20150526_052814_inLine +BABEL_OP3_404_63938_20150526_052814_outLine +BABEL_OP3_404_64350_20141022_195842_inLine +BABEL_OP3_404_64350_20141022_195842_outLine +BABEL_OP3_404_64398_20141126_031756_inLine +BABEL_OP3_404_64398_20141126_031756_outLine +BABEL_OP3_404_64902_20150522_041540_inLine +BABEL_OP3_404_64902_20150522_041540_outLine +BABEL_OP3_404_65064_20141127_003631_inLine +BABEL_OP3_404_65064_20141127_003631_outLine +BABEL_OP3_404_65077_20141015_025834_inLine +BABEL_OP3_404_65077_20141015_025834_outLine +BABEL_OP3_404_65466_20150524_182317_inLine +BABEL_OP3_404_65466_20150524_182317_outLine +BABEL_OP3_404_65477_20141115_020305_inLine +BABEL_OP3_404_65477_20141115_020305_outLine +BABEL_OP3_404_65692_20141117_074414_inLine +BABEL_OP3_404_65692_20141117_074414_outLine +BABEL_OP3_404_65723_20141102_051040_inLine +BABEL_OP3_404_65723_20141102_051040_outLine +BABEL_OP3_404_65882_20141024_191236_inLine +BABEL_OP3_404_65882_20141024_191236_outLine +BABEL_OP3_404_66001_20141006_015944_inLine +BABEL_OP3_404_66001_20141006_015944_outLine +BABEL_OP3_404_66026_20141130_061639_inLine +BABEL_OP3_404_66026_20141130_061639_outLine +BABEL_OP3_404_66350_20150212_043953_inLine +BABEL_OP3_404_66350_20150212_043953_outLine +BABEL_OP3_404_66959_20141130_212725_inLine +BABEL_OP3_404_66959_20141130_212725_outLine +BABEL_OP3_404_66975_20150119_001417_inLine +BABEL_OP3_404_66975_20150119_001417_outLine +BABEL_OP3_404_67066_20150611_043029_inLine +BABEL_OP3_404_67066_20150611_043029_outLine +BABEL_OP3_404_67283_20141008_234315_inLine +BABEL_OP3_404_67283_20141008_234315_outLine +BABEL_OP3_404_67373_20141106_191525_inLine +BABEL_OP3_404_67373_20141106_191525_outLine +BABEL_OP3_404_67373_20141106_192955_inLine +BABEL_OP3_404_67373_20141106_192955_outLine +BABEL_OP3_404_67622_20141021_002234_inLine +BABEL_OP3_404_67622_20141021_002234_outLine +BABEL_OP3_404_67659_20141101_010904_inLine +BABEL_OP3_404_67659_20141101_010904_outLine +BABEL_OP3_404_67964_20150515_011635_inLine +BABEL_OP3_404_67964_20150515_011635_outLine +BABEL_OP3_404_68040_20141118_235516_inLine +BABEL_OP3_404_68040_20141118_235516_outLine +BABEL_OP3_404_68748_20141123_003226_inLine +BABEL_OP3_404_68748_20141123_003226_outLine +BABEL_OP3_404_68854_20150512_025452_inLine +BABEL_OP3_404_68854_20150512_025452_outLine +BABEL_OP3_404_68924_20141119_025325_inLine 
+BABEL_OP3_404_68924_20141119_025325_outLine +BABEL_OP3_404_69992_20141014_035441_inLine +BABEL_OP3_404_69992_20141014_035441_outLine +BABEL_OP3_404_70110_20141020_043016_inLine +BABEL_OP3_404_70110_20141020_043016_outLine +BABEL_OP3_404_70251_20141009_221726_inLine +BABEL_OP3_404_70251_20141009_221726_outLine +BABEL_OP3_404_70293_20150118_220441_inLine +BABEL_OP3_404_70293_20150118_220441_outLine +BABEL_OP3_404_70343_20141126_030147_inLine +BABEL_OP3_404_70343_20141126_030147_outLine +BABEL_OP3_404_70386_20141029_002717_inLine +BABEL_OP3_404_70386_20141029_002717_outLine +BABEL_OP3_404_70452_20141028_031043_inLine +BABEL_OP3_404_70452_20141028_031043_outLine +BABEL_OP3_404_70601_20141103_194852_inLine +BABEL_OP3_404_70601_20141103_194852_outLine +BABEL_OP3_404_71704_20141021_001821_inLine +BABEL_OP3_404_71704_20141021_001821_outLine +BABEL_OP3_404_71704_20141021_002603_inLine +BABEL_OP3_404_71704_20141021_002603_outLine +BABEL_OP3_404_72007_20141201_045843_inLine +BABEL_OP3_404_72007_20141201_045843_outLine +BABEL_OP3_404_72040_20141103_035957_inLine +BABEL_OP3_404_72040_20141103_035957_outLine +BABEL_OP3_404_72040_20141103_042101_inLine +BABEL_OP3_404_72040_20141103_042101_outLine +BABEL_OP3_404_72110_20141128_013317_inLine +BABEL_OP3_404_72110_20141128_013317_outLine +BABEL_OP3_404_72324_20141201_013717_inLine +BABEL_OP3_404_72324_20141201_013717_outLine +BABEL_OP3_404_72654_20141110_003307_inLine +BABEL_OP3_404_72654_20141110_003307_outLine +BABEL_OP3_404_73042_20141022_163748_inLine +BABEL_OP3_404_73042_20141022_163748_outLine +BABEL_OP3_404_73301_20141101_210322_inLine +BABEL_OP3_404_73301_20141101_210322_outLine +BABEL_OP3_404_73446_20150513_002217_inLine +BABEL_OP3_404_73446_20150513_002217_outLine +BABEL_OP3_404_73511_20141129_045420_inLine +BABEL_OP3_404_73511_20141129_045420_outLine +BABEL_OP3_404_73549_20150619_204148_inLine +BABEL_OP3_404_73549_20150619_204148_outLine +BABEL_OP3_404_73591_20141018_022404_inLine +BABEL_OP3_404_73591_20141018_022404_outLine +BABEL_OP3_404_73622_20141016_060513_inLine +BABEL_OP3_404_73622_20141016_060513_outLine +BABEL_OP3_404_73814_20141120_180559_inLine +BABEL_OP3_404_73814_20141120_180559_outLine +BABEL_OP3_404_74226_20141130_235823_inLine +BABEL_OP3_404_74226_20141130_235823_outLine +BABEL_OP3_404_74253_20141201_231036_inLine +BABEL_OP3_404_74253_20141201_231036_outLine +BABEL_OP3_404_74280_20141010_230433_inLine +BABEL_OP3_404_74280_20141010_230433_outLine +BABEL_OP3_404_74667_20141114_221123_inLine +BABEL_OP3_404_74667_20141114_221123_outLine +BABEL_OP3_404_74886_20141022_200909_inLine +BABEL_OP3_404_74886_20141022_200909_outLine +BABEL_OP3_404_74921_20141124_030609_inLine +BABEL_OP3_404_74921_20141124_030609_outLine +BABEL_OP3_404_75223_20141012_224637_inLine +BABEL_OP3_404_75223_20141012_224637_outLine +BABEL_OP3_404_75342_20141130_193132_inLine +BABEL_OP3_404_75342_20141130_193132_outLine +BABEL_OP3_404_75930_20150206_063407_inLine +BABEL_OP3_404_75930_20150206_063407_outLine +BABEL_OP3_404_75993_20141102_192754_inLine +BABEL_OP3_404_75993_20141102_192754_outLine +BABEL_OP3_404_76155_20141118_052757_inLine +BABEL_OP3_404_76155_20141118_052757_outLine +BABEL_OP3_404_76218_20141119_232010_inLine +BABEL_OP3_404_76218_20141119_232010_outLine +BABEL_OP3_404_76499_20141117_005535_inLine +BABEL_OP3_404_76499_20141117_005535_outLine +BABEL_OP3_404_76756_20141120_014151_inLine +BABEL_OP3_404_76756_20141120_014151_outLine +BABEL_OP3_404_77033_20150503_233304_inLine +BABEL_OP3_404_77033_20150503_233304_outLine 
+BABEL_OP3_404_77112_20141105_062419_inLine +BABEL_OP3_404_77112_20141105_062419_outLine +BABEL_OP3_404_77139_20141022_022951_inLine +BABEL_OP3_404_77139_20141022_022951_outLine +BABEL_OP3_404_77744_20141103_034001_inLine +BABEL_OP3_404_77744_20141103_034001_outLine +BABEL_OP3_404_78116_20141128_231322_inLine +BABEL_OP3_404_78116_20141128_231322_outLine +BABEL_OP3_404_78194_20141019_052949_inLine +BABEL_OP3_404_78194_20141019_052949_outLine +BABEL_OP3_404_78398_20141022_235403_inLine +BABEL_OP3_404_78398_20141022_235403_outLine +BABEL_OP3_404_78544_20141130_192658_inLine +BABEL_OP3_404_78544_20141130_192658_outLine +BABEL_OP3_404_78604_20141022_164244_inLine +BABEL_OP3_404_78604_20141022_164244_outLine +BABEL_OP3_404_78630_20141025_220904_inLine +BABEL_OP3_404_78630_20141025_220904_outLine +BABEL_OP3_404_78743_20141202_001451_inLine +BABEL_OP3_404_78743_20141202_001451_outLine +BABEL_OP3_404_78943_20141025_004503_inLine +BABEL_OP3_404_78943_20141025_004503_outLine +BABEL_OP3_404_79028_20150213_002817_inLine +BABEL_OP3_404_79028_20150213_002817_outLine +BABEL_OP3_404_79107_20150614_013139_inLine +BABEL_OP3_404_79107_20150614_013139_outLine +BABEL_OP3_404_79129_20141110_183305_inLine +BABEL_OP3_404_79129_20141110_183305_outLine +BABEL_OP3_404_79367_20141008_232735_inLine +BABEL_OP3_404_79367_20141008_232735_outLine +BABEL_OP3_404_79451_20141031_025601_inLine +BABEL_OP3_404_79451_20141031_025601_outLine +BABEL_OP3_404_79995_20141201_013108_inLine +BABEL_OP3_404_79995_20141201_013108_outLine +BABEL_OP3_404_80622_20141119_054644_inLine +BABEL_OP3_404_80622_20141119_054644_outLine +BABEL_OP3_404_80721_20141201_013404_inLine +BABEL_OP3_404_80721_20141201_013404_outLine +BABEL_OP3_404_81287_20141130_024232_inLine +BABEL_OP3_404_81287_20141130_024232_outLine +BABEL_OP3_404_81392_20141130_022613_inLine +BABEL_OP3_404_81392_20141130_022613_outLine +BABEL_OP3_404_81392_20141130_023326_inLine +BABEL_OP3_404_81392_20141130_023326_outLine +BABEL_OP3_404_81404_20141104_055546_inLine +BABEL_OP3_404_81404_20141104_055546_outLine +BABEL_OP3_404_81433_20141119_073031_inLine +BABEL_OP3_404_81433_20141119_073031_outLine +BABEL_OP3_404_81435_20141128_235050_inLine +BABEL_OP3_404_81435_20141128_235050_outLine +BABEL_OP3_404_81622_20141129_212937_inLine +BABEL_OP3_404_81622_20141129_212937_outLine +BABEL_OP3_404_81810_20141126_051528_inLine +BABEL_OP3_404_81810_20141126_051528_outLine +BABEL_OP3_404_82030_20150517_193420_inLine +BABEL_OP3_404_82030_20150517_193420_outLine +BABEL_OP3_404_82035_20141119_063429_inLine +BABEL_OP3_404_82035_20141119_063429_outLine +BABEL_OP3_404_82138_20141116_234338_inLine +BABEL_OP3_404_82138_20141116_234338_outLine +BABEL_OP3_404_82140_20141117_021927_inLine +BABEL_OP3_404_82140_20141117_021927_outLine +BABEL_OP3_404_82145_20150502_232707_inLine +BABEL_OP3_404_82145_20150502_232707_outLine +BABEL_OP3_404_82391_20141128_063323_inLine +BABEL_OP3_404_82391_20141128_063323_outLine +BABEL_OP3_404_82496_20141009_062659_inLine +BABEL_OP3_404_82496_20141009_062659_outLine +BABEL_OP3_404_82622_20141008_042910_inLine +BABEL_OP3_404_82622_20141008_042910_outLine +BABEL_OP3_404_82904_20150523_231750_inLine +BABEL_OP3_404_82904_20150523_231750_outLine +BABEL_OP3_404_83455_20141112_000643_inLine +BABEL_OP3_404_83455_20141112_000643_outLine +BABEL_OP3_404_83783_20141115_005815_inLine +BABEL_OP3_404_83783_20141115_005815_outLine +BABEL_OP3_404_83935_20141201_214527_inLine +BABEL_OP3_404_83935_20141201_214527_outLine +BABEL_OP3_404_84327_20141130_185722_inLine 
+BABEL_OP3_404_84327_20141130_185722_outLine +BABEL_OP3_404_84408_20141105_182756_inLine +BABEL_OP3_404_84408_20141105_182756_outLine +BABEL_OP3_404_84469_20141130_030156_inLine +BABEL_OP3_404_84469_20141130_030156_outLine +BABEL_OP3_404_84547_20141022_025230_inLine +BABEL_OP3_404_84547_20141022_025230_outLine +BABEL_OP3_404_84605_20141026_234127_inLine +BABEL_OP3_404_84605_20141026_234127_outLine +BABEL_OP3_404_84611_20141024_005352_inLine +BABEL_OP3_404_84611_20141024_005352_outLine +BABEL_OP3_404_84768_20141012_183416_inLine +BABEL_OP3_404_84768_20141012_183416_outLine +BABEL_OP3_404_84823_20141201_061552_inLine +BABEL_OP3_404_84823_20141201_061552_outLine +BABEL_OP3_404_84936_20141130_025359_inLine +BABEL_OP3_404_84936_20141130_025359_outLine +BABEL_OP3_404_85647_20141111_231451_inLine +BABEL_OP3_404_85647_20141111_231451_outLine +BABEL_OP3_404_86321_20141127_025302_inLine +BABEL_OP3_404_86321_20141127_025302_outLine +BABEL_OP3_404_86433_20141201_005203_inLine +BABEL_OP3_404_86433_20141201_005203_outLine +BABEL_OP3_404_86433_20141201_010208_inLine +BABEL_OP3_404_86433_20141201_010208_outLine +BABEL_OP3_404_86433_20141201_011757_inLine +BABEL_OP3_404_86433_20141201_011757_outLine +BABEL_OP3_404_86467_20141019_022847_inLine +BABEL_OP3_404_86467_20141019_022847_outLine +BABEL_OP3_404_86467_20141019_024243_inLine +BABEL_OP3_404_86467_20141019_024243_outLine +BABEL_OP3_404_86557_20141021_041027_inLine +BABEL_OP3_404_86557_20141021_041027_outLine +BABEL_OP3_404_86676_20141125_223657_inLine +BABEL_OP3_404_86676_20141125_223657_outLine +BABEL_OP3_404_86952_20141008_194318_inLine +BABEL_OP3_404_86952_20141008_194318_outLine +BABEL_OP3_404_87073_20141007_223759_inLine +BABEL_OP3_404_87073_20141007_223759_outLine +BABEL_OP3_404_87280_20141201_232519_inLine +BABEL_OP3_404_87280_20141201_232519_outLine +BABEL_OP3_404_87693_20141105_002311_inLine +BABEL_OP3_404_87693_20141105_002311_outLine +BABEL_OP3_404_88601_20141115_021916_inLine +BABEL_OP3_404_88601_20141115_021916_outLine +BABEL_OP3_404_88601_20141115_024632_inLine +BABEL_OP3_404_88601_20141115_024632_outLine +BABEL_OP3_404_88686_20141019_023828_inLine +BABEL_OP3_404_88686_20141019_023828_outLine +BABEL_OP3_404_88925_20141201_043633_inLine +BABEL_OP3_404_88925_20141201_043633_outLine +BABEL_OP3_404_88982_20141106_212556_inLine +BABEL_OP3_404_88982_20141106_212556_outLine +BABEL_OP3_404_89358_20141119_055634_inLine +BABEL_OP3_404_89358_20141119_055634_outLine +BABEL_OP3_404_89695_20141115_212119_inLine +BABEL_OP3_404_89695_20141115_212119_outLine +BABEL_OP3_404_89794_20141130_055655_inLine +BABEL_OP3_404_89794_20141130_055655_outLine +BABEL_OP3_404_89877_20141120_061055_inLine +BABEL_OP3_404_89877_20141120_061055_outLine +BABEL_OP3_404_90417_20150611_052409_inLine +BABEL_OP3_404_90417_20150611_052409_outLine +BABEL_OP3_404_90737_20141116_233627_inLine +BABEL_OP3_404_90737_20141116_233627_outLine +BABEL_OP3_404_90739_20141116_034352_inLine +BABEL_OP3_404_90739_20141116_034352_outLine +BABEL_OP3_404_90777_20141115_012657_inLine +BABEL_OP3_404_90777_20141115_012657_outLine +BABEL_OP3_404_90935_20141104_195620_inLine +BABEL_OP3_404_90935_20141104_195620_outLine +BABEL_OP3_404_91080_20141119_062453_inLine +BABEL_OP3_404_91080_20141119_062453_outLine +BABEL_OP3_404_91125_20141010_234127_inLine +BABEL_OP3_404_91125_20141010_234127_outLine +BABEL_OP3_404_91336_20141110_011202_inLine +BABEL_OP3_404_91336_20141110_011202_outLine +BABEL_OP3_404_92065_20141201_041019_inLine +BABEL_OP3_404_92065_20141201_041019_outLine 
+BABEL_OP3_404_92077_20150610_053919_inLine +BABEL_OP3_404_92077_20150610_053919_outLine +BABEL_OP3_404_92459_20141026_000227_inLine +BABEL_OP3_404_92459_20141026_000227_outLine +BABEL_OP3_404_92459_20141026_000839_inLine +BABEL_OP3_404_92459_20141026_000839_outLine +BABEL_OP3_404_92509_20141020_034921_inLine +BABEL_OP3_404_92509_20141020_034921_outLine +BABEL_OP3_404_92527_20141115_024550_inLine +BABEL_OP3_404_92527_20141115_024550_outLine +BABEL_OP3_404_92809_20141009_080406_inLine +BABEL_OP3_404_92809_20141009_080406_outLine +BABEL_OP3_404_92886_20141103_032433_inLine +BABEL_OP3_404_92886_20141103_032433_outLine +BABEL_OP3_404_92941_20141027_175733_inLine +BABEL_OP3_404_92941_20141027_175733_outLine +BABEL_OP3_404_92941_20141027_180356_inLine +BABEL_OP3_404_92941_20141027_180356_outLine +BABEL_OP3_404_93224_20141119_210156_inLine +BABEL_OP3_404_93224_20141119_210156_outLine +BABEL_OP3_404_93411_20141119_193212_inLine +BABEL_OP3_404_93411_20141119_193212_outLine +BABEL_OP3_404_93861_20141111_181324_inLine +BABEL_OP3_404_93861_20141111_181324_outLine +BABEL_OP3_404_93946_20141129_015946_inLine +BABEL_OP3_404_93946_20141129_015946_outLine +BABEL_OP3_404_93964_20141111_213251_inLine +BABEL_OP3_404_93964_20141111_213251_outLine +BABEL_OP3_404_94141_20150516_175827_inLine +BABEL_OP3_404_94141_20150516_175827_outLine +BABEL_OP3_404_94253_20141029_184039_inLine +BABEL_OP3_404_94253_20141029_184039_outLine +BABEL_OP3_404_94409_20141117_003829_inLine +BABEL_OP3_404_94409_20141117_003829_outLine +BABEL_OP3_404_94666_20141119_231115_inLine +BABEL_OP3_404_94666_20141119_231115_outLine +BABEL_OP3_404_94745_20141201_033432_inLine +BABEL_OP3_404_94745_20141201_033432_outLine +BABEL_OP3_404_94923_20141116_230334_inLine +BABEL_OP3_404_94923_20141116_230334_outLine +BABEL_OP3_404_94978_20150528_024921_inLine +BABEL_OP3_404_94978_20150528_024921_outLine +BABEL_OP3_404_95294_20141129_062228_inLine +BABEL_OP3_404_95294_20141129_062228_outLine +BABEL_OP3_404_95467_20150612_031400_inLine +BABEL_OP3_404_95467_20150612_031400_outLine +BABEL_OP3_404_95490_20141021_050016_inLine +BABEL_OP3_404_95490_20141021_050016_outLine +BABEL_OP3_404_95663_20141022_043520_inLine +BABEL_OP3_404_95663_20141022_043520_outLine +BABEL_OP3_404_95670_20141019_224431_inLine +BABEL_OP3_404_95670_20141019_224431_outLine +BABEL_OP3_404_95677_20150220_205948_inLine +BABEL_OP3_404_95677_20150220_205948_outLine +BABEL_OP3_404_95942_20150514_235402_inLine +BABEL_OP3_404_95942_20150514_235402_outLine +BABEL_OP3_404_96088_20150524_191148_inLine +BABEL_OP3_404_96088_20150524_191148_outLine +BABEL_OP3_404_96190_20141107_040725_inLine +BABEL_OP3_404_96190_20141107_040725_outLine +BABEL_OP3_404_96405_20141026_045704_inLine +BABEL_OP3_404_96405_20141026_045704_outLine +BABEL_OP3_404_96820_20141109_204448_inLine +BABEL_OP3_404_96820_20141109_204448_outLine +BABEL_OP3_404_96842_20150610_040559_inLine +BABEL_OP3_404_96842_20150610_040559_outLine +BABEL_OP3_404_96910_20141026_195400_inLine +BABEL_OP3_404_96910_20141026_195400_outLine +BABEL_OP3_404_96934_20141025_223703_inLine +BABEL_OP3_404_96934_20141025_223703_outLine +BABEL_OP3_404_96934_20141025_225156_inLine +BABEL_OP3_404_96934_20141025_225156_outLine +BABEL_OP3_404_96985_20141013_053332_inLine +BABEL_OP3_404_96985_20141013_053332_outLine +BABEL_OP3_404_97363_20141120_034843_inLine +BABEL_OP3_404_97363_20141120_034843_outLine +BABEL_OP3_404_97570_20141120_050344_inLine +BABEL_OP3_404_97570_20141120_050344_outLine +BABEL_OP3_404_98311_20141022_042555_inLine 
+BABEL_OP3_404_98311_20141022_042555_outLine +BABEL_OP3_404_98356_20141123_013523_inLine +BABEL_OP3_404_98356_20141123_013523_outLine +BABEL_OP3_404_98390_20141014_024134_inLine +BABEL_OP3_404_98390_20141014_024134_outLine +BABEL_OP3_404_98565_20150217_195949_inLine +BABEL_OP3_404_98565_20150217_195949_outLine +BABEL_OP3_404_98580_20141130_022138_inLine +BABEL_OP3_404_98580_20141130_022138_outLine +BABEL_OP3_404_98909_20141027_032903_inLine +BABEL_OP3_404_98909_20141027_032903_outLine +BABEL_OP3_404_99516_20141019_071828_inLine +BABEL_OP3_404_99516_20141019_071828_outLine diff --git a/egs/babel/s5d/conf/lists/404-georgian/sub-train.list b/egs/babel/s5d/conf/lists/404-georgian/sub-train.list new file mode 100644 index 00000000000..a042ee569ef --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/sub-train.list @@ -0,0 +1,124 @@ +BABEL_OP3_404_11663_20141118_032146_inLine +BABEL_OP3_404_11663_20141118_032146_outLine +BABEL_OP3_404_12242_20141028_021853_inLine +BABEL_OP3_404_12242_20141028_021853_outLine +BABEL_OP3_404_13178_20141129_192909_inLine +BABEL_OP3_404_13178_20141129_192909_outLine +BABEL_OP3_404_14137_20141025_202817_inLine +BABEL_OP3_404_14137_20141025_202817_outLine +BABEL_OP3_404_14875_20141026_230227_inLine +BABEL_OP3_404_14875_20141026_230227_outLine +BABEL_OP3_404_15869_20150218_225936_inLine +BABEL_OP3_404_15869_20150218_225936_outLine +BABEL_OP3_404_17113_20150611_050102_inLine +BABEL_OP3_404_17113_20150611_050102_outLine +BABEL_OP3_404_23505_20141021_032033_inLine +BABEL_OP3_404_23505_20141021_032033_outLine +BABEL_OP3_404_24470_20141111_184651_inLine +BABEL_OP3_404_24470_20141111_184651_outLine +BABEL_OP3_404_24470_20141111_190229_inLine +BABEL_OP3_404_24470_20141111_190229_outLine +BABEL_OP3_404_24679_20141018_015615_inLine +BABEL_OP3_404_24679_20141018_015615_outLine +BABEL_OP3_404_26388_20141026_014207_inLine +BABEL_OP3_404_26388_20141026_014207_outLine +BABEL_OP3_404_27042_20141201_215107_inLine +BABEL_OP3_404_27042_20141201_215107_outLine +BABEL_OP3_404_28538_20141119_005526_inLine +BABEL_OP3_404_28538_20141119_005526_outLine +BABEL_OP3_404_29208_20141106_013309_inLine +BABEL_OP3_404_29208_20141106_013309_outLine +BABEL_OP3_404_30461_20150620_020316_inLine +BABEL_OP3_404_30461_20150620_020316_outLine +BABEL_OP3_404_31979_20141106_000523_inLine +BABEL_OP3_404_31979_20141106_000523_outLine +BABEL_OP3_404_31992_20141014_221817_inLine +BABEL_OP3_404_31992_20141014_221817_outLine +BABEL_OP3_404_37064_20141102_063308_inLine +BABEL_OP3_404_37064_20141102_063308_outLine +BABEL_OP3_404_37281_20141119_053453_inLine +BABEL_OP3_404_37281_20141119_053453_outLine +BABEL_OP3_404_37853_20150602_030625_inLine +BABEL_OP3_404_37853_20150602_030625_outLine +BABEL_OP3_404_40713_20141028_221207_inLine +BABEL_OP3_404_40713_20141028_221207_outLine +BABEL_OP3_404_41680_20141012_040411_inLine +BABEL_OP3_404_41680_20141012_040411_outLine +BABEL_OP3_404_41920_20141008_040539_inLine +BABEL_OP3_404_41920_20141008_040539_outLine +BABEL_OP3_404_42877_20150212_052937_inLine +BABEL_OP3_404_42877_20150212_052937_outLine +BABEL_OP3_404_45121_20150609_055234_inLine +BABEL_OP3_404_45121_20150609_055234_outLine +BABEL_OP3_404_46169_20141130_224339_inLine +BABEL_OP3_404_46169_20141130_224339_outLine +BABEL_OP3_404_46625_20141011_040505_inLine +BABEL_OP3_404_46625_20141011_040505_outLine +BABEL_OP3_404_46681_20141021_040451_inLine +BABEL_OP3_404_46681_20141021_040451_outLine +BABEL_OP3_404_47270_20150512_053415_inLine +BABEL_OP3_404_47270_20150512_053415_outLine 
+BABEL_OP3_404_48844_20141020_065414_inLine +BABEL_OP3_404_48844_20141020_065414_outLine +BABEL_OP3_404_49768_20141026_022902_inLine +BABEL_OP3_404_49768_20141026_022902_outLine +BABEL_OP3_404_50175_20141021_025726_inLine +BABEL_OP3_404_50175_20141021_025726_outLine +BABEL_OP3_404_52301_20141009_051739_inLine +BABEL_OP3_404_52301_20141009_051739_outLine +BABEL_OP3_404_52301_20141009_054049_inLine +BABEL_OP3_404_52301_20141009_054049_outLine +BABEL_OP3_404_52490_20141016_020323_inLine +BABEL_OP3_404_52490_20141016_020323_outLine +BABEL_OP3_404_56213_20141201_000837_inLine +BABEL_OP3_404_56213_20141201_000837_outLine +BABEL_OP3_404_58103_20141030_002209_inLine +BABEL_OP3_404_58103_20141030_002209_outLine +BABEL_OP3_404_59078_20141111_004941_inLine +BABEL_OP3_404_59078_20141111_004941_outLine +BABEL_OP3_404_61225_20141009_174003_inLine +BABEL_OP3_404_61225_20141009_174003_outLine +BABEL_OP3_404_63220_20141127_033605_inLine +BABEL_OP3_404_63220_20141127_033605_outLine +BABEL_OP3_404_64494_20141026_203549_inLine +BABEL_OP3_404_64494_20141026_203549_outLine +BABEL_OP3_404_64768_20141027_201818_inLine +BABEL_OP3_404_64768_20141027_201818_outLine +BABEL_OP3_404_66916_20141022_000731_inLine +BABEL_OP3_404_66916_20141022_000731_outLine +BABEL_OP3_404_67401_20141109_211809_inLine +BABEL_OP3_404_67401_20141109_211809_outLine +BABEL_OP3_404_68059_20141109_052011_inLine +BABEL_OP3_404_68059_20141109_052011_outLine +BABEL_OP3_404_68068_20141201_054518_inLine +BABEL_OP3_404_68068_20141201_054518_outLine +BABEL_OP3_404_68384_20141130_035214_inLine +BABEL_OP3_404_68384_20141130_035214_outLine +BABEL_OP3_404_68627_20141105_190511_inLine +BABEL_OP3_404_68627_20141105_190511_outLine +BABEL_OP3_404_72844_20141007_033837_inLine +BABEL_OP3_404_72844_20141007_033837_outLine +BABEL_OP3_404_73837_20141026_191037_inLine +BABEL_OP3_404_73837_20141026_191037_outLine +BABEL_OP3_404_78511_20141201_003606_inLine +BABEL_OP3_404_78511_20141201_003606_outLine +BABEL_OP3_404_79139_20141117_054733_inLine +BABEL_OP3_404_79139_20141117_054733_outLine +BABEL_OP3_404_81971_20141022_025641_inLine +BABEL_OP3_404_81971_20141022_025641_outLine +BABEL_OP3_404_83062_20150523_220236_inLine +BABEL_OP3_404_83062_20150523_220236_outLine +BABEL_OP3_404_83775_20141030_230742_inLine +BABEL_OP3_404_83775_20141030_230742_outLine +BABEL_OP3_404_84339_20150502_014143_inLine +BABEL_OP3_404_84339_20150502_014143_outLine +BABEL_OP3_404_86191_20141027_013544_inLine +BABEL_OP3_404_86191_20141027_013544_outLine +BABEL_OP3_404_86888_20141119_022459_inLine +BABEL_OP3_404_86888_20141119_022459_outLine +BABEL_OP3_404_95966_20141129_060246_inLine +BABEL_OP3_404_95966_20141129_060246_outLine +BABEL_OP3_404_97461_20141118_230730_inLine +BABEL_OP3_404_97461_20141118_230730_outLine +BABEL_OP3_404_99487_20141021_053024_inLine +BABEL_OP3_404_99487_20141021_053024_outLine diff --git a/egs/babel/s5d/conf/lists/404-georgian/sub-train.untranscribed.list b/egs/babel/s5d/conf/lists/404-georgian/sub-train.untranscribed.list new file mode 100644 index 00000000000..32d863a65ad --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/sub-train.untranscribed.list @@ -0,0 +1,929 @@ +BABEL_OP3_404_10019_20141101_191932_inLine +BABEL_OP3_404_10019_20141101_191932_outLine +BABEL_OP3_404_10058_20150526_034808_inLine +BABEL_OP3_404_10411_20150611_172027_inLine +BABEL_OP3_404_10411_20150611_172027_outLine +BABEL_OP3_404_10416_20141117_064700_inLine +BABEL_OP3_404_10416_20141117_064700_outLine +BABEL_OP3_404_10647_20150514_001106_inLine +BABEL_OP3_404_10647_20150514_001106_outLine 
+BABEL_OP3_404_10938_20141030_023413_inLine +BABEL_OP3_404_10938_20141030_023413_outLine +BABEL_OP3_404_10974_20141119_205506_inLine +BABEL_OP3_404_10974_20141119_205506_outLine +BABEL_OP3_404_11352_20150513_002642_inLine +BABEL_OP3_404_11352_20150513_002642_outLine +BABEL_OP3_404_11673_20141023_035438_inLine +BABEL_OP3_404_11673_20141023_035438_outLine +BABEL_OP3_404_11681_20141107_190101_inLine +BABEL_OP3_404_11681_20141107_190101_outLine +BABEL_OP3_404_11859_20150611_041737_inLine +BABEL_OP3_404_11859_20150611_041737_outLine +BABEL_OP3_404_12220_20141116_205911_inLine +BABEL_OP3_404_12220_20141116_205911_outLine +BABEL_OP3_404_12609_20150524_172934_inLine +BABEL_OP3_404_12609_20150524_172934_outLine +BABEL_OP3_404_13030_20141101_200709_inLine +BABEL_OP3_404_13030_20141101_200709_outLine +BABEL_OP3_404_13126_20150524_221540_inLine +BABEL_OP3_404_13126_20150524_221540_outLine +BABEL_OP3_404_13324_20141022_200257_inLine +BABEL_OP3_404_13324_20141022_200257_outLine +BABEL_OP3_404_13664_20141012_013523_inLine +BABEL_OP3_404_13664_20141012_013523_outLine +BABEL_OP3_404_13709_20150512_015216_inLine +BABEL_OP3_404_13709_20150512_015216_outLine +BABEL_OP3_404_14158_20141130_030130_inLine +BABEL_OP3_404_14158_20141130_030130_outLine +BABEL_OP3_404_14229_20141029_200136_inLine +BABEL_OP3_404_14229_20141029_200136_outLine +BABEL_OP3_404_14237_20141006_171921_inLine +BABEL_OP3_404_14237_20141006_171921_outLine +BABEL_OP3_404_14440_20141127_213106_inLine +BABEL_OP3_404_14440_20141127_213106_outLine +BABEL_OP3_404_14807_20141110_231934_inLine +BABEL_OP3_404_14807_20141110_231934_outLine +BABEL_OP3_404_14899_20141022_202217_inLine +BABEL_OP3_404_14899_20141022_202217_outLine +BABEL_OP3_404_14929_20141129_192841_inLine +BABEL_OP3_404_14929_20141129_192841_outLine +BABEL_OP3_404_15024_20141118_234824_inLine +BABEL_OP3_404_15024_20141118_234824_outLine +BABEL_OP3_404_15042_20150506_232829_inLine +BABEL_OP3_404_15042_20150506_232829_outLine +BABEL_OP3_404_15382_20141130_213942_inLine +BABEL_OP3_404_15382_20141130_213942_outLine +BABEL_OP3_404_15535_20141129_021659_inLine +BABEL_OP3_404_15535_20141129_021659_outLine +BABEL_OP3_404_15638_20141127_220502_outLine +BABEL_OP3_404_15848_20141006_231138_inLine +BABEL_OP3_404_15848_20141006_231138_outLine +BABEL_OP3_404_15902_20141020_173105_outLine +BABEL_OP3_404_16149_20141010_173548_inLine +BABEL_OP3_404_16149_20141010_173548_outLine +BABEL_OP3_404_16467_20141130_014316_inLine +BABEL_OP3_404_16467_20141130_014316_outLine +BABEL_OP3_404_16467_20141130_015010_inLine +BABEL_OP3_404_16467_20141130_015010_outLine +BABEL_OP3_404_16475_20141116_052010_outLine +BABEL_OP3_404_16601_20141201_041704_inLine +BABEL_OP3_404_16601_20141201_041704_outLine +BABEL_OP3_404_17280_20141103_190330_inLine +BABEL_OP3_404_17280_20141103_190330_outLine +BABEL_OP3_404_17320_20150524_213213_inLine +BABEL_OP3_404_17320_20150524_213213_outLine +BABEL_OP3_404_17420_20150503_201902_inLine +BABEL_OP3_404_17420_20150503_201902_outLine +BABEL_OP3_404_17420_20150527_025815_inLine +BABEL_OP3_404_17420_20150527_025815_outLine +BABEL_OP3_404_17420_20150527_034621_inLine +BABEL_OP3_404_17420_20150527_034621_outLine +BABEL_OP3_404_17520_20141113_032534_inLine +BABEL_OP3_404_17567_20141117_182919_inLine +BABEL_OP3_404_17567_20141117_182919_outLine +BABEL_OP3_404_17573_20141129_035040_inLine +BABEL_OP3_404_17573_20141129_035040_outLine +BABEL_OP3_404_17615_20141201_025917_inLine +BABEL_OP3_404_17615_20141201_025917_outLine +BABEL_OP3_404_17890_20141128_040046_inLine 
+BABEL_OP3_404_17890_20141128_040046_outLine +BABEL_OP3_404_17923_20141022_231429_outLine +BABEL_OP3_404_18118_20150503_165936_inLine +BABEL_OP3_404_18118_20150503_165936_outLine +BABEL_OP3_404_18291_20150611_062705_outLine +BABEL_OP3_404_18291_20150611_063700_outLine +BABEL_OP3_404_18766_20150610_064349_inLine +BABEL_OP3_404_19120_20150525_014657_inLine +BABEL_OP3_404_19120_20150525_014657_outLine +BABEL_OP3_404_19120_20150525_015635_inLine +BABEL_OP3_404_19120_20150525_015635_outLine +BABEL_OP3_404_19134_20141120_053128_inLine +BABEL_OP3_404_19134_20141120_053128_outLine +BABEL_OP3_404_19703_20141027_004315_inLine +BABEL_OP3_404_19703_20141027_004315_outLine +BABEL_OP3_404_19877_20150506_202237_outLine +BABEL_OP3_404_20133_20141010_195231_inLine +BABEL_OP3_404_20133_20141010_195231_outLine +BABEL_OP3_404_20454_20150218_171143_inLine +BABEL_OP3_404_20454_20150218_171143_outLine +BABEL_OP3_404_20985_20141126_183236_inLine +BABEL_OP3_404_20985_20141126_183236_outLine +BABEL_OP3_404_21004_20141201_035831_inLine +BABEL_OP3_404_21004_20141201_035831_outLine +BABEL_OP3_404_21159_20150615_021612_inLine +BABEL_OP3_404_21435_20150523_030702_inLine +BABEL_OP3_404_21435_20150523_030702_outLine +BABEL_OP3_404_21581_20141101_011021_inLine +BABEL_OP3_404_21581_20141101_011021_outLine +BABEL_OP3_404_21807_20141112_225225_outLine +BABEL_OP3_404_22280_20141111_020522_inLine +BABEL_OP3_404_22280_20141111_020522_outLine +BABEL_OP3_404_22591_20150217_220714_inLine +BABEL_OP3_404_23046_20141031_030755_inLine +BABEL_OP3_404_23046_20141031_030755_outLine +BABEL_OP3_404_23731_20141130_033602_inLine +BABEL_OP3_404_23731_20141130_033602_outLine +BABEL_OP3_404_23980_20141106_225951_inLine +BABEL_OP3_404_23980_20141106_225951_outLine +BABEL_OP3_404_24209_20150212_224614_inLine +BABEL_OP3_404_24239_20150517_203015_inLine +BABEL_OP3_404_24270_20141111_012902_inLine +BABEL_OP3_404_24270_20141111_012902_outLine +BABEL_OP3_404_24323_20141117_020615_outLine +BABEL_OP3_404_24501_20150522_030231_inLine +BABEL_OP3_404_24532_20141007_211325_inLine +BABEL_OP3_404_24532_20141007_211325_outLine +BABEL_OP3_404_24586_20150524_190657_inLine +BABEL_OP3_404_24586_20150524_190657_outLine +BABEL_OP3_404_24589_20141031_020641_inLine +BABEL_OP3_404_24589_20141031_020641_outLine +BABEL_OP3_404_24590_20141116_230233_inLine +BABEL_OP3_404_24590_20141116_230233_outLine +BABEL_OP3_404_24982_20141102_021352_inLine +BABEL_OP3_404_24982_20141102_021352_outLine +BABEL_OP3_404_25068_20150206_022730_outLine +BABEL_OP3_404_25085_20150611_040906_inLine +BABEL_OP3_404_25085_20150611_040906_outLine +BABEL_OP3_404_25412_20141120_031532_inLine +BABEL_OP3_404_25412_20141120_031532_outLine +BABEL_OP3_404_25496_20150613_034126_inLine +BABEL_OP3_404_25496_20150613_034126_outLine +BABEL_OP3_404_26398_20150527_032152_inLine +BABEL_OP3_404_26398_20150527_032152_outLine +BABEL_OP3_404_26478_20150617_004029_inLine +BABEL_OP3_404_26478_20150617_004029_outLine +BABEL_OP3_404_26836_20141102_024528_inLine +BABEL_OP3_404_26836_20141102_024528_outLine +BABEL_OP3_404_27203_20141119_185720_inLine +BABEL_OP3_404_27203_20141119_185720_outLine +BABEL_OP3_404_27203_20141119_191138_inLine +BABEL_OP3_404_27203_20141119_191138_outLine +BABEL_OP3_404_27590_20141128_051454_inLine +BABEL_OP3_404_28280_20150619_024509_inLine +BABEL_OP3_404_28280_20150619_024509_outLine +BABEL_OP3_404_28280_20150619_025848_inLine +BABEL_OP3_404_28280_20150619_025848_outLine +BABEL_OP3_404_28303_20141028_182204_inLine +BABEL_OP3_404_28303_20141028_182204_outLine 
+BABEL_OP3_404_28522_20141124_222758_inLine +BABEL_OP3_404_28522_20141124_222758_outLine +BABEL_OP3_404_28600_20141201_223206_inLine +BABEL_OP3_404_28600_20141201_223206_outLine +BABEL_OP3_404_28871_20141019_181913_inLine +BABEL_OP3_404_28871_20141019_181913_outLine +BABEL_OP3_404_28945_20141104_060349_outLine +BABEL_OP3_404_29039_20141128_035839_inLine +BABEL_OP3_404_29039_20141128_035839_outLine +BABEL_OP3_404_29076_20141109_215142_inLine +BABEL_OP3_404_29076_20141109_215142_outLine +BABEL_OP3_404_29230_20150611_051340_inLine +BABEL_OP3_404_29230_20150611_051340_outLine +BABEL_OP3_404_29439_20150524_201524_inLine +BABEL_OP3_404_29439_20150524_201524_outLine +BABEL_OP3_404_30098_20150610_150504_inLine +BABEL_OP3_404_30098_20150610_150504_outLine +BABEL_OP3_404_30432_20141126_052839_inLine +BABEL_OP3_404_30432_20141126_052839_outLine +BABEL_OP3_404_30497_20150525_194737_inLine +BABEL_OP3_404_30497_20150525_194737_outLine +BABEL_OP3_404_30645_20141019_220859_inLine +BABEL_OP3_404_30653_20150514_014515_inLine +BABEL_OP3_404_31267_20150615_011004_outLine +BABEL_OP3_404_31484_20141122_232804_inLine +BABEL_OP3_404_31484_20141122_232804_outLine +BABEL_OP3_404_31624_20141105_214349_inLine +BABEL_OP3_404_31624_20141105_214349_outLine +BABEL_OP3_404_31919_20150526_220911_inLine +BABEL_OP3_404_31919_20150526_220911_outLine +BABEL_OP3_404_32122_20141115_022841_inLine +BABEL_OP3_404_32122_20141115_022841_outLine +BABEL_OP3_404_32287_20150210_060823_inLine +BABEL_OP3_404_32287_20150210_060823_outLine +BABEL_OP3_404_32630_20150609_012137_inLine +BABEL_OP3_404_32630_20150609_012137_outLine +BABEL_OP3_404_32708_20141106_032826_inLine +BABEL_OP3_404_32708_20141106_032826_outLine +BABEL_OP3_404_32727_20141128_203500_inLine +BABEL_OP3_404_32727_20141128_203500_outLine +BABEL_OP3_404_32727_20141128_204751_inLine +BABEL_OP3_404_32727_20141128_204751_outLine +BABEL_OP3_404_32959_20141201_005331_inLine +BABEL_OP3_404_32959_20141201_005331_outLine +BABEL_OP3_404_32998_20141112_054111_inLine +BABEL_OP3_404_33355_20141019_032024_inLine +BABEL_OP3_404_33355_20141019_032024_outLine +BABEL_OP3_404_33355_20141019_034109_inLine +BABEL_OP3_404_33355_20141019_034109_outLine +BABEL_OP3_404_33704_20141207_073436_inLine +BABEL_OP3_404_33704_20141207_073436_outLine +BABEL_OP3_404_34328_20141119_054513_outLine +BABEL_OP3_404_34328_20141119_055432_outLine +BABEL_OP3_404_34679_20141102_052808_inLine +BABEL_OP3_404_34679_20141102_052808_outLine +BABEL_OP3_404_34688_20141009_073303_inLine +BABEL_OP3_404_34688_20141009_073303_outLine +BABEL_OP3_404_34811_20141109_001009_inLine +BABEL_OP3_404_34811_20141109_001009_outLine +BABEL_OP3_404_34899_20150611_060602_outLine +BABEL_OP3_404_35008_20141201_023042_inLine +BABEL_OP3_404_35008_20141201_023042_outLine +BABEL_OP3_404_35143_20141130_181111_inLine +BABEL_OP3_404_35143_20141130_181111_outLine +BABEL_OP3_404_35181_20150526_211416_inLine +BABEL_OP3_404_35181_20150526_211416_outLine +BABEL_OP3_404_35706_20150523_015900_inLine +BABEL_OP3_404_35706_20150523_015900_outLine +BABEL_OP3_404_35786_20150604_015518_inLine +BABEL_OP3_404_35786_20150604_015518_outLine +BABEL_OP3_404_36017_20150528_192934_inLine +BABEL_OP3_404_36017_20150528_192934_outLine +BABEL_OP3_404_36039_20150526_230125_inLine +BABEL_OP3_404_36039_20150526_230125_outLine +BABEL_OP3_404_36059_20150601_023254_inLine +BABEL_OP3_404_36059_20150601_023254_outLine +BABEL_OP3_404_36059_20150601_033346_inLine +BABEL_OP3_404_36059_20150601_033346_outLine +BABEL_OP3_404_36147_20150211_013803_outLine 
+BABEL_OP3_404_36219_20141104_012216_inLine +BABEL_OP3_404_36219_20141104_012216_outLine +BABEL_OP3_404_36642_20150610_161207_inLine +BABEL_OP3_404_36642_20150610_161207_outLine +BABEL_OP3_404_37290_20141115_050457_inLine +BABEL_OP3_404_37290_20141115_050457_outLine +BABEL_OP3_404_37598_20141119_045926_inLine +BABEL_OP3_404_37598_20141119_045926_outLine +BABEL_OP3_404_37682_20141101_221445_inLine +BABEL_OP3_404_37682_20141101_221445_outLine +BABEL_OP3_404_38125_20150526_233108_inLine +BABEL_OP3_404_38125_20150526_233108_outLine +BABEL_OP3_404_38323_20150615_021843_inLine +BABEL_OP3_404_38340_20141103_231545_inLine +BABEL_OP3_404_38340_20141103_231545_outLine +BABEL_OP3_404_38554_20141010_224451_inLine +BABEL_OP3_404_38554_20141010_224451_outLine +BABEL_OP3_404_38588_20141118_163844_inLine +BABEL_OP3_404_38588_20141118_163844_outLine +BABEL_OP3_404_38664_20141030_175135_inLine +BABEL_OP3_404_38664_20141030_175135_outLine +BABEL_OP3_404_38979_20150503_202406_outLine +BABEL_OP3_404_39099_20150511_053646_outLine +BABEL_OP3_404_39307_20141022_200554_inLine +BABEL_OP3_404_39307_20141022_201758_inLine +BABEL_OP3_404_39426_20150527_181901_outLine +BABEL_OP3_404_39744_20141023_002710_inLine +BABEL_OP3_404_39893_20150611_034149_inLine +BABEL_OP3_404_39920_20150503_205354_outLine +BABEL_OP3_404_40557_20141127_200639_inLine +BABEL_OP3_404_40557_20141127_200639_outLine +BABEL_OP3_404_40939_20150210_212748_inLine +BABEL_OP3_404_40939_20150210_212748_outLine +BABEL_OP3_404_41097_20141129_055801_inLine +BABEL_OP3_404_41097_20141129_055801_outLine +BABEL_OP3_404_41100_20141021_022126_inLine +BABEL_OP3_404_41100_20141021_022126_outLine +BABEL_OP3_404_41272_20150503_232941_inLine +BABEL_OP3_404_41334_20150617_041322_inLine +BABEL_OP3_404_41400_20150515_021408_inLine +BABEL_OP3_404_41609_20141009_013405_inLine +BABEL_OP3_404_41609_20141009_013405_outLine +BABEL_OP3_404_41692_20150604_005657_inLine +BABEL_OP3_404_41692_20150604_005657_outLine +BABEL_OP3_404_41745_20141114_235452_inLine +BABEL_OP3_404_41745_20141114_235452_outLine +BABEL_OP3_404_41958_20141029_212755_inLine +BABEL_OP3_404_41958_20141029_212755_outLine +BABEL_OP3_404_42155_20141127_055149_inLine +BABEL_OP3_404_42619_20141130_012456_outLine +BABEL_OP3_404_42834_20141125_004837_inLine +BABEL_OP3_404_42834_20141125_004837_outLine +BABEL_OP3_404_42883_20150604_035732_inLine +BABEL_OP3_404_42883_20150604_035732_outLine +BABEL_OP3_404_43368_20141031_010629_inLine +BABEL_OP3_404_43368_20141031_010629_outLine +BABEL_OP3_404_43388_20141114_212210_inLine +BABEL_OP3_404_43388_20141114_214120_inLine +BABEL_OP3_404_43588_20150517_233637_inLine +BABEL_OP3_404_43789_20141120_011327_outLine +BABEL_OP3_404_44114_20150614_012319_inLine +BABEL_OP3_404_44114_20150614_012319_outLine +BABEL_OP3_404_44309_20150525_022635_inLine +BABEL_OP3_404_44309_20150525_022635_outLine +BABEL_OP3_404_44477_20141201_180604_inLine +BABEL_OP3_404_44477_20141201_180604_outLine +BABEL_OP3_404_44478_20150512_225118_inLine +BABEL_OP3_404_44847_20141130_221248_inLine +BABEL_OP3_404_44847_20141130_221248_outLine +BABEL_OP3_404_45106_20141119_050859_inLine +BABEL_OP3_404_45106_20141119_050859_outLine +BABEL_OP3_404_45374_20150122_014830_outLine +BABEL_OP3_404_45374_20150122_015920_outLine +BABEL_OP3_404_45459_20150525_020410_inLine +BABEL_OP3_404_45459_20150525_020410_outLine +BABEL_OP3_404_45560_20141012_030417_inLine +BABEL_OP3_404_45560_20141012_030417_outLine +BABEL_OP3_404_45699_20150205_021829_inLine +BABEL_OP3_404_45851_20150514_155157_inLine 
+BABEL_OP3_404_45851_20150514_155157_outLine +BABEL_OP3_404_45908_20150515_004218_outLine +BABEL_OP3_404_46268_20141019_032022_inLine +BABEL_OP3_404_46268_20141019_032022_outLine +BABEL_OP3_404_46310_20141015_051100_inLine +BABEL_OP3_404_46310_20141015_051100_outLine +BABEL_OP3_404_46315_20141129_012912_inLine +BABEL_OP3_404_46315_20141129_012912_outLine +BABEL_OP3_404_46550_20141105_072519_inLine +BABEL_OP3_404_46550_20141105_072519_outLine +BABEL_OP3_404_46688_20141015_211329_inLine +BABEL_OP3_404_46688_20141015_211329_outLine +BABEL_OP3_404_46712_20141027_224004_inLine +BABEL_OP3_404_46712_20141027_224004_outLine +BABEL_OP3_404_46881_20141012_020055_inLine +BABEL_OP3_404_46881_20141012_020055_outLine +BABEL_OP3_404_46974_20141128_055136_inLine +BABEL_OP3_404_46974_20141128_055136_outLine +BABEL_OP3_404_46976_20141107_183806_inLine +BABEL_OP3_404_46976_20141107_183806_outLine +BABEL_OP3_404_47156_20150625_025324_inLine +BABEL_OP3_404_47156_20150625_025324_outLine +BABEL_OP3_404_47802_20141110_200430_inLine +BABEL_OP3_404_47802_20141110_200430_outLine +BABEL_OP3_404_47823_20141201_044425_inLine +BABEL_OP3_404_47823_20141201_044425_outLine +BABEL_OP3_404_48016_20150615_000741_inLine +BABEL_OP3_404_48016_20150615_000741_outLine +BABEL_OP3_404_48243_20141023_200903_inLine +BABEL_OP3_404_48243_20141023_200903_outLine +BABEL_OP3_404_48610_20141013_011505_inLine +BABEL_OP3_404_48610_20141013_012904_inLine +BABEL_OP3_404_48663_20150512_202837_inLine +BABEL_OP3_404_48663_20150512_202837_outLine +BABEL_OP3_404_49197_20141117_024730_inLine +BABEL_OP3_404_49197_20141117_024730_outLine +BABEL_OP3_404_49306_20150524_003356_inLine +BABEL_OP3_404_49306_20150524_003356_outLine +BABEL_OP3_404_49630_20141128_020114_inLine +BABEL_OP3_404_49630_20141128_020114_outLine +BABEL_OP3_404_49767_20150613_050113_inLine +BABEL_OP3_404_49767_20150613_050113_outLine +BABEL_OP3_404_49775_20141011_005306_inLine +BABEL_OP3_404_49775_20141011_005306_outLine +BABEL_OP3_404_49902_20141101_175534_inLine +BABEL_OP3_404_49902_20141101_175534_outLine +BABEL_OP3_404_49907_20141103_050534_inLine +BABEL_OP3_404_49907_20141103_050534_outLine +BABEL_OP3_404_49945_20150610_154709_inLine +BABEL_OP3_404_50601_20141127_032527_inLine +BABEL_OP3_404_50601_20141127_032527_outLine +BABEL_OP3_404_50745_20150513_162805_inLine +BABEL_OP3_404_50745_20150513_162805_outLine +BABEL_OP3_404_50779_20141115_012852_inLine +BABEL_OP3_404_50779_20141115_012852_outLine +BABEL_OP3_404_50810_20141007_234432_inLine +BABEL_OP3_404_50810_20141007_234432_outLine +BABEL_OP3_404_51015_20141123_193824_inLine +BABEL_OP3_404_51015_20141123_193824_outLine +BABEL_OP3_404_51414_20150604_001601_inLine +BABEL_OP3_404_51414_20150604_001601_outLine +BABEL_OP3_404_51484_20141202_000325_inLine +BABEL_OP3_404_51484_20141202_000325_outLine +BABEL_OP3_404_51701_20150620_010924_outLine +BABEL_OP3_404_52070_20150620_014422_outLine +BABEL_OP3_404_52070_20150620_020559_outLine +BABEL_OP3_404_52246_20141118_035022_inLine +BABEL_OP3_404_52246_20141118_035022_outLine +BABEL_OP3_404_52246_20141118_040850_inLine +BABEL_OP3_404_52246_20141118_040850_outLine +BABEL_OP3_404_52404_20141125_004855_inLine +BABEL_OP3_404_52404_20141125_004855_outLine +BABEL_OP3_404_52725_20150522_222730_inLine +BABEL_OP3_404_52725_20150522_222730_outLine +BABEL_OP3_404_53063_20141201_005237_inLine +BABEL_OP3_404_53063_20141201_005237_outLine +BABEL_OP3_404_53072_20150518_015132_inLine +BABEL_OP3_404_53415_20150503_225920_inLine +BABEL_OP3_404_53415_20150503_225920_outLine 
+BABEL_OP3_404_53492_20150525_055025_inLine +BABEL_OP3_404_53492_20150525_055025_outLine +BABEL_OP3_404_53665_20150526_004549_inLine +BABEL_OP3_404_53917_20150503_205456_outLine +BABEL_OP3_404_53957_20141201_051933_inLine +BABEL_OP3_404_54104_20141008_214620_inLine +BABEL_OP3_404_54104_20141008_214620_outLine +BABEL_OP3_404_54160_20141009_180704_inLine +BABEL_OP3_404_54160_20141009_180704_outLine +BABEL_OP3_404_54160_20141009_184719_inLine +BABEL_OP3_404_54160_20141009_184719_outLine +BABEL_OP3_404_54160_20141009_185557_inLine +BABEL_OP3_404_54160_20141009_185557_outLine +BABEL_OP3_404_54405_20141117_054820_inLine +BABEL_OP3_404_54405_20141117_054820_outLine +BABEL_OP3_404_54477_20141211_033627_inLine +BABEL_OP3_404_54477_20141211_033627_outLine +BABEL_OP3_404_54744_20141015_012011_inLine +BABEL_OP3_404_54744_20141015_012011_outLine +BABEL_OP3_404_55013_20150525_222257_inLine +BABEL_OP3_404_55013_20150525_222257_outLine +BABEL_OP3_404_55259_20141029_225631_inLine +BABEL_OP3_404_55259_20141029_225631_outLine +BABEL_OP3_404_55267_20141130_212756_inLine +BABEL_OP3_404_55349_20150523_031602_inLine +BABEL_OP3_404_55349_20150523_031602_outLine +BABEL_OP3_404_56019_20150502_020750_inLine +BABEL_OP3_404_56019_20150502_020750_outLine +BABEL_OP3_404_56076_20150516_164959_inLine +BABEL_OP3_404_56076_20150516_164959_outLine +BABEL_OP3_404_56331_20150526_020747_inLine +BABEL_OP3_404_56331_20150526_020747_outLine +BABEL_OP3_404_56743_20141114_223719_inLine +BABEL_OP3_404_56743_20141114_223719_outLine +BABEL_OP3_404_57065_20141201_002920_inLine +BABEL_OP3_404_57219_20150618_045613_inLine +BABEL_OP3_404_57219_20150618_045613_outLine +BABEL_OP3_404_57464_20150523_224617_inLine +BABEL_OP3_404_57542_20150526_233832_inLine +BABEL_OP3_404_57542_20150526_233832_outLine +BABEL_OP3_404_57542_20150526_235003_inLine +BABEL_OP3_404_57542_20150526_235003_outLine +BABEL_OP3_404_57654_20141023_235628_inLine +BABEL_OP3_404_57654_20141023_235628_outLine +BABEL_OP3_404_57678_20141104_023128_inLine +BABEL_OP3_404_57678_20141104_023128_outLine +BABEL_OP3_404_57919_20150127_041057_inLine +BABEL_OP3_404_57919_20150127_041057_outLine +BABEL_OP3_404_58006_20150526_024205_inLine +BABEL_OP3_404_58006_20150526_024205_outLine +BABEL_OP3_404_58026_20150615_004130_inLine +BABEL_OP3_404_58026_20150615_004130_outLine +BABEL_OP3_404_58915_20150611_034220_outLine +BABEL_OP3_404_59262_20141130_212633_inLine +BABEL_OP3_404_59262_20141130_212633_outLine +BABEL_OP3_404_59307_20150504_003405_inLine +BABEL_OP3_404_59307_20150504_003405_outLine +BABEL_OP3_404_59720_20141029_204612_inLine +BABEL_OP3_404_59720_20141029_204612_outLine +BABEL_OP3_404_59864_20150602_014458_inLine +BABEL_OP3_404_60026_20141008_051633_inLine +BABEL_OP3_404_60026_20141008_051633_outLine +BABEL_OP3_404_60299_20150611_040929_inLine +BABEL_OP3_404_60310_20141130_231532_inLine +BABEL_OP3_404_60310_20141130_231532_outLine +BABEL_OP3_404_60352_20141201_060712_inLine +BABEL_OP3_404_60352_20141201_060712_outLine +BABEL_OP3_404_60352_20141201_061821_inLine +BABEL_OP3_404_60352_20141201_061821_outLine +BABEL_OP3_404_60458_20150609_021527_inLine +BABEL_OP3_404_60458_20150609_021527_outLine +BABEL_OP3_404_60474_20141029_182816_inLine +BABEL_OP3_404_60474_20141029_182816_outLine +BABEL_OP3_404_60477_20150613_223056_inLine +BABEL_OP3_404_60477_20150613_224002_inLine +BABEL_OP3_404_60498_20150606_022221_inLine +BABEL_OP3_404_60498_20150606_022221_outLine +BABEL_OP3_404_60626_20141028_212539_inLine +BABEL_OP3_404_60626_20141028_212539_outLine 
+BABEL_OP3_404_60706_20141020_215729_inLine +BABEL_OP3_404_60706_20141020_215729_outLine +BABEL_OP3_404_61167_20141030_222711_inLine +BABEL_OP3_404_61167_20141030_222711_outLine +BABEL_OP3_404_61219_20141025_193634_inLine +BABEL_OP3_404_61219_20141025_193634_outLine +BABEL_OP3_404_61678_20141019_201928_inLine +BABEL_OP3_404_61678_20141019_201928_outLine +BABEL_OP3_404_61873_20141108_214852_inLine +BABEL_OP3_404_61873_20141108_214852_outLine +BABEL_OP3_404_61888_20150504_171019_inLine +BABEL_OP3_404_61971_20150525_020101_outLine +BABEL_OP3_404_62155_20150522_032307_inLine +BABEL_OP3_404_62155_20150522_032307_outLine +BABEL_OP3_404_62286_20141105_204359_inLine +BABEL_OP3_404_62286_20141105_204359_outLine +BABEL_OP3_404_62360_20150517_033230_inLine +BABEL_OP3_404_62360_20150517_033230_outLine +BABEL_OP3_404_62456_20141108_202333_inLine +BABEL_OP3_404_62456_20141108_202333_outLine +BABEL_OP3_404_62714_20150522_011337_inLine +BABEL_OP3_404_62714_20150522_011337_outLine +BABEL_OP3_404_62724_20141130_200827_inLine +BABEL_OP3_404_62724_20141130_200827_outLine +BABEL_OP3_404_62734_20141029_221513_inLine +BABEL_OP3_404_62734_20141029_221513_outLine +BABEL_OP3_404_62852_20141013_054854_outLine +BABEL_OP3_404_63081_20141021_032233_inLine +BABEL_OP3_404_63081_20141021_032233_outLine +BABEL_OP3_404_63081_20141021_033457_inLine +BABEL_OP3_404_63081_20141021_033457_outLine +BABEL_OP3_404_63084_20141130_221452_inLine +BABEL_OP3_404_63084_20141130_221452_outLine +BABEL_OP3_404_63425_20141126_054504_inLine +BABEL_OP3_404_63481_20141020_221014_outLine +BABEL_OP3_404_63481_20141020_224225_outLine +BABEL_OP3_404_63670_20141130_050318_inLine +BABEL_OP3_404_63670_20141130_050318_outLine +BABEL_OP3_404_63757_20141111_180721_inLine +BABEL_OP3_404_63757_20141111_180721_outLine +BABEL_OP3_404_63906_20150525_050310_inLine +BABEL_OP3_404_63906_20150525_050310_outLine +BABEL_OP3_404_63999_20150610_041309_inLine +BABEL_OP3_404_64014_20150503_032745_inLine +BABEL_OP3_404_64014_20150503_032745_outLine +BABEL_OP3_404_64722_20150514_034208_outLine +BABEL_OP3_404_64759_20141014_044027_inLine +BABEL_OP3_404_64759_20141014_045519_inLine +BABEL_OP3_404_64796_20141022_055826_inLine +BABEL_OP3_404_64870_20141108_192546_inLine +BABEL_OP3_404_64870_20141108_192546_outLine +BABEL_OP3_404_65561_20141124_060558_inLine +BABEL_OP3_404_65561_20141124_060558_outLine +BABEL_OP3_404_65640_20150528_211835_inLine +BABEL_OP3_404_65640_20150528_211835_outLine +BABEL_OP3_404_66045_20141117_035937_inLine +BABEL_OP3_404_66045_20141117_035937_outLine +BABEL_OP3_404_66177_20150503_202932_inLine +BABEL_OP3_404_66177_20150503_202932_outLine +BABEL_OP3_404_66822_20141117_020953_inLine +BABEL_OP3_404_66822_20141117_020953_outLine +BABEL_OP3_404_66967_20141008_202611_inLine +BABEL_OP3_404_66967_20141008_202611_outLine +BABEL_OP3_404_67152_20150503_201836_inLine +BABEL_OP3_404_67152_20150503_201836_outLine +BABEL_OP3_404_67304_20150211_054416_inLine +BABEL_OP3_404_67304_20150211_054416_outLine +BABEL_OP3_404_67552_20141126_011955_inLine +BABEL_OP3_404_67552_20141126_011955_outLine +BABEL_OP3_404_67842_20141104_051753_inLine +BABEL_OP3_404_67842_20141104_051753_outLine +BABEL_OP3_404_68244_20141119_065540_inLine +BABEL_OP3_404_68244_20141119_065540_outLine +BABEL_OP3_404_68306_20141126_180315_inLine +BABEL_OP3_404_68306_20141126_180315_outLine +BABEL_OP3_404_68385_20141017_031005_inLine +BABEL_OP3_404_68385_20141017_031005_outLine +BABEL_OP3_404_68823_20150212_041147_inLine +BABEL_OP3_404_68823_20150212_041147_outLine 
+BABEL_OP3_404_69096_20150512_165126_inLine +BABEL_OP3_404_69096_20150512_165126_outLine +BABEL_OP3_404_69107_20141120_010459_inLine +BABEL_OP3_404_69107_20141120_010459_outLine +BABEL_OP3_404_69153_20141130_221412_inLine +BABEL_OP3_404_69153_20141130_221412_outLine +BABEL_OP3_404_69153_20141130_222842_inLine +BABEL_OP3_404_69153_20141130_222842_outLine +BABEL_OP3_404_69474_20141128_051323_outLine +BABEL_OP3_404_69574_20141006_023156_inLine +BABEL_OP3_404_69574_20141006_023156_outLine +BABEL_OP3_404_69578_20141117_003921_inLine +BABEL_OP3_404_69578_20141117_003921_outLine +BABEL_OP3_404_69633_20141129_051648_inLine +BABEL_OP3_404_69633_20141129_051648_outLine +BABEL_OP3_404_69636_20141126_061322_inLine +BABEL_OP3_404_69636_20141126_061322_outLine +BABEL_OP3_404_69885_20150503_011226_inLine +BABEL_OP3_404_69885_20150503_011226_outLine +BABEL_OP3_404_69937_20150620_015912_inLine +BABEL_OP3_404_69964_20150524_015556_inLine +BABEL_OP3_404_69964_20150524_015556_outLine +BABEL_OP3_404_69982_20150625_035440_outLine +BABEL_OP3_404_70121_20141104_202610_inLine +BABEL_OP3_404_70121_20141104_202610_outLine +BABEL_OP3_404_70221_20141124_052004_inLine +BABEL_OP3_404_70221_20141124_052004_outLine +BABEL_OP3_404_70282_20141111_000251_inLine +BABEL_OP3_404_70282_20141111_000251_outLine +BABEL_OP3_404_70460_20150527_015340_inLine +BABEL_OP3_404_70460_20150527_015340_outLine +BABEL_OP3_404_70526_20150501_015444_inLine +BABEL_OP3_404_70526_20150501_015444_outLine +BABEL_OP3_404_70713_20150527_013058_inLine +BABEL_OP3_404_70713_20150527_013058_outLine +BABEL_OP3_404_70794_20141021_185105_inLine +BABEL_OP3_404_70794_20141021_185105_outLine +BABEL_OP3_404_71189_20150523_005918_inLine +BABEL_OP3_404_71189_20150523_005918_outLine +BABEL_OP3_404_71263_20141119_234747_inLine +BABEL_OP3_404_71263_20141119_234747_outLine +BABEL_OP3_404_71278_20150211_052730_inLine +BABEL_OP3_404_71278_20150211_052730_outLine +BABEL_OP3_404_71278_20150211_054040_inLine +BABEL_OP3_404_71278_20150211_054040_outLine +BABEL_OP3_404_71333_20141102_023503_inLine +BABEL_OP3_404_71333_20141102_023503_outLine +BABEL_OP3_404_71401_20150206_070446_inLine +BABEL_OP3_404_71401_20150206_070446_outLine +BABEL_OP3_404_71404_20141023_215509_inLine +BABEL_OP3_404_71404_20141023_215509_outLine +BABEL_OP3_404_71460_20150206_015309_outLine +BABEL_OP3_404_71559_20141210_220929_outLine +BABEL_OP3_404_71566_20141130_035713_inLine +BABEL_OP3_404_71566_20141130_035713_outLine +BABEL_OP3_404_71566_20141130_040359_inLine +BABEL_OP3_404_71566_20141130_040359_outLine +BABEL_OP3_404_71780_20141105_055543_inLine +BABEL_OP3_404_71780_20141105_055543_outLine +BABEL_OP3_404_72319_20150502_041426_inLine +BABEL_OP3_404_72319_20150502_041426_outLine +BABEL_OP3_404_72733_20150515_044419_inLine +BABEL_OP3_404_72733_20150515_044419_outLine +BABEL_OP3_404_73072_20141012_012029_inLine +BABEL_OP3_404_73072_20141012_012029_outLine +BABEL_OP3_404_73119_20141026_232203_inLine +BABEL_OP3_404_73119_20141026_232203_outLine +BABEL_OP3_404_73258_20141117_010123_inLine +BABEL_OP3_404_73258_20141117_010123_outLine +BABEL_OP3_404_73485_20150512_234636_inLine +BABEL_OP3_404_73485_20150512_234636_outLine +BABEL_OP3_404_73964_20150512_205010_inLine +BABEL_OP3_404_73964_20150512_205010_outLine +BABEL_OP3_404_74641_20141108_223951_inLine +BABEL_OP3_404_74641_20141108_223951_outLine +BABEL_OP3_404_74728_20150503_042547_inLine +BABEL_OP3_404_74728_20150503_042547_outLine +BABEL_OP3_404_74799_20141109_222638_inLine +BABEL_OP3_404_74799_20141109_222638_outLine 
+BABEL_OP3_404_75465_20141129_223330_outLine +BABEL_OP3_404_75869_20150527_230650_inLine +BABEL_OP3_404_75869_20150527_230650_outLine +BABEL_OP3_404_75975_20150127_051140_outLine +BABEL_OP3_404_76126_20141201_202238_inLine +BABEL_OP3_404_76126_20141201_202238_outLine +BABEL_OP3_404_76238_20141129_223455_inLine +BABEL_OP3_404_76238_20141129_223455_outLine +BABEL_OP3_404_76372_20150601_014341_inLine +BABEL_OP3_404_76372_20150601_014341_outLine +BABEL_OP3_404_76437_20141019_202715_inLine +BABEL_OP3_404_76437_20141019_202715_outLine +BABEL_OP3_404_76444_20141127_032124_inLine +BABEL_OP3_404_76444_20141127_032124_outLine +BABEL_OP3_404_76482_20150618_063131_outLine +BABEL_OP3_404_76683_20141110_191551_inLine +BABEL_OP3_404_76683_20141110_191551_outLine +BABEL_OP3_404_76837_20150124_222250_outLine +BABEL_OP3_404_76970_20150625_191722_inLine +BABEL_OP3_404_77126_20141022_202348_inLine +BABEL_OP3_404_77126_20141022_202348_outLine +BABEL_OP3_404_77146_20141019_060916_inLine +BABEL_OP3_404_77242_20150612_024655_inLine +BABEL_OP3_404_77391_20141026_222314_inLine +BABEL_OP3_404_77391_20141026_222314_outLine +BABEL_OP3_404_77427_20141030_192713_inLine +BABEL_OP3_404_77427_20141030_192713_outLine +BABEL_OP3_404_77567_20141021_021210_inLine +BABEL_OP3_404_77567_20141021_021210_outLine +BABEL_OP3_404_77730_20141014_201059_inLine +BABEL_OP3_404_77730_20141014_201059_outLine +BABEL_OP3_404_77803_20141020_030844_inLine +BABEL_OP3_404_77803_20141020_030844_outLine +BABEL_OP3_404_77990_20141024_215822_inLine +BABEL_OP3_404_77990_20141024_215822_outLine +BABEL_OP3_404_78016_20141029_233059_inLine +BABEL_OP3_404_78016_20141029_233059_outLine +BABEL_OP3_404_78254_20141025_202742_inLine +BABEL_OP3_404_78254_20141025_202742_outLine +BABEL_OP3_404_78254_20141025_204922_inLine +BABEL_OP3_404_78254_20141025_204922_outLine +BABEL_OP3_404_78454_20141115_043455_inLine +BABEL_OP3_404_78749_20150620_025728_inLine +BABEL_OP3_404_78749_20150620_025728_outLine +BABEL_OP3_404_78976_20141025_183704_inLine +BABEL_OP3_404_78976_20141025_183704_outLine +BABEL_OP3_404_79190_20141108_232204_inLine +BABEL_OP3_404_79190_20141108_232204_outLine +BABEL_OP3_404_79590_20141129_025808_outLine +BABEL_OP3_404_79751_20141101_232250_inLine +BABEL_OP3_404_79751_20141101_232250_outLine +BABEL_OP3_404_79820_20141104_045340_inLine +BABEL_OP3_404_79820_20141104_045340_outLine +BABEL_OP3_404_79858_20141015_200446_inLine +BABEL_OP3_404_79898_20150620_022648_inLine +BABEL_OP3_404_79898_20150620_022648_outLine +BABEL_OP3_404_79898_20150620_024014_inLine +BABEL_OP3_404_79898_20150620_024014_outLine +BABEL_OP3_404_80069_20150614_233606_inLine +BABEL_OP3_404_80069_20150614_233606_outLine +BABEL_OP3_404_80306_20141119_003833_inLine +BABEL_OP3_404_80306_20141119_003833_outLine +BABEL_OP3_404_80306_20141119_005121_inLine +BABEL_OP3_404_80306_20141119_005121_outLine +BABEL_OP3_404_80439_20141026_005410_inLine +BABEL_OP3_404_80439_20141026_005410_outLine +BABEL_OP3_404_80559_20141022_010255_inLine +BABEL_OP3_404_80655_20150525_221544_inLine +BABEL_OP3_404_80655_20150525_221544_outLine +BABEL_OP3_404_80897_20141119_233718_inLine +BABEL_OP3_404_80897_20141119_233718_outLine +BABEL_OP3_404_81149_20150525_003741_inLine +BABEL_OP3_404_81149_20150525_003741_outLine +BABEL_OP3_404_81213_20141102_205052_inLine +BABEL_OP3_404_81213_20141102_205052_outLine +BABEL_OP3_404_81229_20141117_041745_inLine +BABEL_OP3_404_81229_20141117_041745_outLine +BABEL_OP3_404_81427_20141030_015136_inLine +BABEL_OP3_404_81427_20141030_015136_outLine 
+BABEL_OP3_404_81854_20150610_060437_inLine +BABEL_OP3_404_82089_20141117_045302_inLine +BABEL_OP3_404_82089_20141117_045302_outLine +BABEL_OP3_404_82303_20150614_024236_inLine +BABEL_OP3_404_82303_20150614_024236_outLine +BABEL_OP3_404_82473_20141026_060037_inLine +BABEL_OP3_404_82473_20141026_060037_outLine +BABEL_OP3_404_82626_20150615_014517_inLine +BABEL_OP3_404_82637_20141021_010105_inLine +BABEL_OP3_404_82637_20141021_010105_outLine +BABEL_OP3_404_82742_20141201_234306_inLine +BABEL_OP3_404_82742_20141201_234306_outLine +BABEL_OP3_404_82863_20141119_044230_inLine +BABEL_OP3_404_82863_20141119_044230_outLine +BABEL_OP3_404_83238_20141119_180953_inLine +BABEL_OP3_404_83238_20141119_180953_outLine +BABEL_OP3_404_83366_20141120_192208_inLine +BABEL_OP3_404_83366_20141120_192208_outLine +BABEL_OP3_404_83651_20141102_170912_inLine +BABEL_OP3_404_83651_20141102_170912_outLine +BABEL_OP3_404_83771_20150604_012300_outLine +BABEL_OP3_404_83851_20141028_203735_inLine +BABEL_OP3_404_83851_20141028_203735_outLine +BABEL_OP3_404_83929_20141018_184023_inLine +BABEL_OP3_404_83929_20141018_184023_outLine +BABEL_OP3_404_83974_20150617_022055_inLine +BABEL_OP3_404_84055_20150504_002015_inLine +BABEL_OP3_404_84055_20150504_002015_outLine +BABEL_OP3_404_84061_20141030_205021_inLine +BABEL_OP3_404_84061_20141030_205021_outLine +BABEL_OP3_404_84125_20141018_023340_inLine +BABEL_OP3_404_84125_20141018_023340_outLine +BABEL_OP3_404_84458_20141130_053628_outLine +BABEL_OP3_404_84815_20141127_011952_inLine +BABEL_OP3_404_84815_20141127_013345_inLine +BABEL_OP3_404_85047_20141117_014630_inLine +BABEL_OP3_404_85047_20141117_014630_outLine +BABEL_OP3_404_85048_20141127_023704_inLine +BABEL_OP3_404_85048_20141127_023704_outLine +BABEL_OP3_404_85254_20150620_035606_inLine +BABEL_OP3_404_85254_20150620_035606_outLine +BABEL_OP3_404_85322_20141008_235518_inLine +BABEL_OP3_404_85322_20141008_235518_outLine +BABEL_OP3_404_85340_20141103_022707_inLine +BABEL_OP3_404_85340_20141103_022707_outLine +BABEL_OP3_404_85651_20141211_032650_inLine +BABEL_OP3_404_85651_20141211_032650_outLine +BABEL_OP3_404_86472_20141201_011325_inLine +BABEL_OP3_404_86472_20141201_011325_outLine +BABEL_OP3_404_86597_20150612_170328_inLine +BABEL_OP3_404_86597_20150612_170328_outLine +BABEL_OP3_404_86635_20141127_204158_inLine +BABEL_OP3_404_86635_20141127_204158_outLine +BABEL_OP3_404_86722_20141029_192140_inLine +BABEL_OP3_404_86722_20141029_192140_outLine +BABEL_OP3_404_87074_20141105_190107_outLine +BABEL_OP3_404_87470_20141114_214639_inLine +BABEL_OP3_404_87470_20141114_214639_outLine +BABEL_OP3_404_87629_20141127_020403_inLine +BABEL_OP3_404_87629_20141127_020403_outLine +BABEL_OP3_404_87777_20141127_040747_inLine +BABEL_OP3_404_87777_20141127_040747_outLine +BABEL_OP3_404_87871_20141201_023608_inLine +BABEL_OP3_404_87871_20141201_023608_outLine +BABEL_OP3_404_87921_20141201_023029_inLine +BABEL_OP3_404_87921_20141201_023029_outLine +BABEL_OP3_404_88260_20141103_234824_inLine +BABEL_OP3_404_88260_20141103_234824_outLine +BABEL_OP3_404_88445_20141119_043713_inLine +BABEL_OP3_404_88445_20141119_043713_outLine +BABEL_OP3_404_88661_20141127_025208_inLine +BABEL_OP3_404_88661_20141127_025208_outLine +BABEL_OP3_404_88669_20141119_000147_inLine +BABEL_OP3_404_88669_20141119_000147_outLine +BABEL_OP3_404_88783_20141201_045305_inLine +BABEL_OP3_404_88783_20141201_045305_outLine +BABEL_OP3_404_88873_20141028_190127_inLine +BABEL_OP3_404_88873_20141028_190127_outLine +BABEL_OP3_404_89045_20141022_193202_inLine 
+BABEL_OP3_404_89045_20141022_193202_outLine +BABEL_OP3_404_89330_20150616_002908_inLine +BABEL_OP3_404_89330_20150616_002908_outLine +BABEL_OP3_404_89372_20141010_000950_inLine +BABEL_OP3_404_89372_20141010_000950_outLine +BABEL_OP3_404_89650_20150220_222402_inLine +BABEL_OP3_404_89650_20150220_222402_outLine +BABEL_OP3_404_89650_20150220_224606_inLine +BABEL_OP3_404_89650_20150220_224606_outLine +BABEL_OP3_404_89665_20141103_202723_inLine +BABEL_OP3_404_89665_20141103_202723_outLine +BABEL_OP3_404_89943_20141105_211847_outLine +BABEL_OP3_404_90347_20141119_012016_inLine +BABEL_OP3_404_90347_20141119_012016_outLine +BABEL_OP3_404_90760_20150611_151739_inLine +BABEL_OP3_404_90760_20150611_151739_outLine +BABEL_OP3_404_90832_20150616_012728_inLine +BABEL_OP3_404_90832_20150616_012728_outLine +BABEL_OP3_404_90930_20150119_021352_inLine +BABEL_OP3_404_90930_20150119_021352_outLine +BABEL_OP3_404_91383_20150618_035815_inLine +BABEL_OP3_404_91463_20141116_023036_inLine +BABEL_OP3_404_91463_20141116_023036_outLine +BABEL_OP3_404_91475_20150614_034536_inLine +BABEL_OP3_404_91581_20141129_045608_inLine +BABEL_OP3_404_91581_20141129_045608_outLine +BABEL_OP3_404_91581_20141129_050730_inLine +BABEL_OP3_404_91581_20141129_050730_outLine +BABEL_OP3_404_91593_20150611_021825_inLine +BABEL_OP3_404_91593_20150611_021825_outLine +BABEL_OP3_404_91825_20141009_181224_inLine +BABEL_OP3_404_91825_20141009_181224_outLine +BABEL_OP3_404_91825_20141009_183843_inLine +BABEL_OP3_404_91825_20141009_183843_outLine +BABEL_OP3_404_91884_20150503_022858_inLine +BABEL_OP3_404_91884_20150503_022858_outLine +BABEL_OP3_404_91888_20150512_191012_inLine +BABEL_OP3_404_91888_20150512_191012_outLine +BABEL_OP3_404_91891_20141129_005825_inLine +BABEL_OP3_404_91891_20141129_005825_outLine +BABEL_OP3_404_91944_20141022_021002_inLine +BABEL_OP3_404_91971_20150217_041455_inLine +BABEL_OP3_404_91971_20150217_041455_outLine +BABEL_OP3_404_91977_20141122_230420_outLine +BABEL_OP3_404_92176_20141119_195614_inLine +BABEL_OP3_404_92176_20141119_195614_outLine +BABEL_OP3_404_92281_20150625_185123_inLine +BABEL_OP3_404_92698_20141117_072302_inLine +BABEL_OP3_404_92698_20141117_072302_outLine +BABEL_OP3_404_92736_20141201_011442_inLine +BABEL_OP3_404_92736_20141201_011442_outLine +BABEL_OP3_404_92757_20150525_200048_inLine +BABEL_OP3_404_92757_20150525_200048_outLine +BABEL_OP3_404_92792_20150503_182854_outLine +BABEL_OP3_404_92792_20150525_025523_outLine +BABEL_OP3_404_92942_20141120_022830_inLine +BABEL_OP3_404_92942_20141120_022830_outLine +BABEL_OP3_404_93007_20150615_051230_inLine +BABEL_OP3_404_93007_20150615_051230_outLine +BABEL_OP3_404_93858_20150611_043732_inLine +BABEL_OP3_404_94002_20141119_015307_inLine +BABEL_OP3_404_94002_20141119_015307_outLine +BABEL_OP3_404_94025_20141129_180207_inLine +BABEL_OP3_404_94025_20141129_180207_outLine +BABEL_OP3_404_94333_20141020_024439_outLine +BABEL_OP3_404_94487_20150518_005132_outLine +BABEL_OP3_404_94869_20141007_194254_inLine +BABEL_OP3_404_94869_20141007_194254_outLine +BABEL_OP3_404_95077_20141201_055702_outLine +BABEL_OP3_404_95269_20141105_221810_inLine +BABEL_OP3_404_95269_20141105_221810_outLine +BABEL_OP3_404_95338_20150610_211203_inLine +BABEL_OP3_404_95338_20150610_211203_outLine +BABEL_OP3_404_95399_20141119_001023_inLine +BABEL_OP3_404_95399_20141119_001023_outLine +BABEL_OP3_404_95583_20141019_010741_inLine +BABEL_OP3_404_95583_20141019_010741_outLine +BABEL_OP3_404_96059_20150524_042224_outLine +BABEL_OP3_404_96205_20141119_033053_inLine 
+BABEL_OP3_404_96205_20141119_033053_outLine +BABEL_OP3_404_96205_20141119_034909_inLine +BABEL_OP3_404_96205_20141119_034909_outLine +BABEL_OP3_404_96247_20150526_202623_outLine +BABEL_OP3_404_96376_20150503_033706_inLine +BABEL_OP3_404_96376_20150503_033706_outLine +BABEL_OP3_404_96504_20141103_031329_inLine +BABEL_OP3_404_96504_20141103_031329_outLine +BABEL_OP3_404_96690_20141117_053054_inLine +BABEL_OP3_404_96690_20141117_053054_outLine +BABEL_OP3_404_96808_20150609_034129_inLine +BABEL_OP3_404_97097_20150601_042649_outLine +BABEL_OP3_404_97136_20150528_011250_inLine +BABEL_OP3_404_97136_20150528_011250_outLine +BABEL_OP3_404_97557_20141119_230718_inLine +BABEL_OP3_404_97557_20141119_230718_outLine +BABEL_OP3_404_97588_20141018_234016_inLine +BABEL_OP3_404_97588_20141018_234016_outLine +BABEL_OP3_404_97588_20141018_235425_inLine +BABEL_OP3_404_97588_20141018_235425_outLine +BABEL_OP3_404_97896_20141116_221329_inLine +BABEL_OP3_404_97896_20141116_221329_outLine +BABEL_OP3_404_97911_20150613_195820_outLine +BABEL_OP3_404_97988_20141201_030306_inLine +BABEL_OP3_404_97988_20141201_030306_outLine +BABEL_OP3_404_98165_20141030_214051_inLine +BABEL_OP3_404_98165_20141030_214051_outLine +BABEL_OP3_404_98192_20150617_021906_outLine +BABEL_OP3_404_98489_20141102_002030_inLine +BABEL_OP3_404_98489_20141102_004054_inLine +BABEL_OP3_404_98678_20150528_021605_inLine +BABEL_OP3_404_98678_20150528_023029_inLine +BABEL_OP3_404_98888_20141113_212715_inLine +BABEL_OP3_404_98888_20141113_212715_outLine +BABEL_OP3_404_99202_20141108_210814_inLine +BABEL_OP3_404_99202_20141108_210814_outLine +BABEL_OP3_404_99289_20150521_220314_inLine +BABEL_OP3_404_99289_20150521_220314_outLine +BABEL_OP3_404_99289_20150521_222144_inLine +BABEL_OP3_404_99289_20150521_222144_outLine +BABEL_OP3_404_99594_20141105_194545_inLine +BABEL_OP3_404_99594_20141105_194545_outLine +BABEL_OP3_404_99718_20141019_051850_inLine +BABEL_OP3_404_99718_20141019_051850_outLine +BABEL_OP3_404_99718_20141019_053305_inLine +BABEL_OP3_404_99718_20141019_053305_outLine +BABEL_OP3_404_99732_20141130_232553_inLine +BABEL_OP3_404_99732_20141130_232553_outLine +BABEL_OP3_404_99813_20141120_025129_inLine +BABEL_OP3_404_99813_20141120_025129_outLine +BABEL_OP3_404_99920_20141022_052026_inLine diff --git a/egs/babel/s5d/conf/lists/404-georgian/training.list b/egs/babel/s5d/conf/lists/404-georgian/training.list new file mode 100644 index 00000000000..efc0afb8219 --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/training.list @@ -0,0 +1,518 @@ +BABEL_OP3_404_10019_20141101_191932_inLine +BABEL_OP3_404_10019_20141101_191932_outLine +BABEL_OP3_404_10416_20141117_064700_inLine +BABEL_OP3_404_10416_20141117_064700_outLine +BABEL_OP3_404_10647_20150514_001106_inLine +BABEL_OP3_404_10647_20150514_001106_outLine +BABEL_OP3_404_10974_20141119_205506_inLine +BABEL_OP3_404_10974_20141119_205506_outLine +BABEL_OP3_404_11663_20141118_032146_inLine +BABEL_OP3_404_11663_20141118_032146_outLine +BABEL_OP3_404_11673_20141023_035438_inLine +BABEL_OP3_404_11673_20141023_035438_outLine +BABEL_OP3_404_11681_20141107_190101_inLine +BABEL_OP3_404_11681_20141107_190101_outLine +BABEL_OP3_404_12242_20141028_021853_inLine +BABEL_OP3_404_12242_20141028_021853_outLine +BABEL_OP3_404_13030_20141101_200709_inLine +BABEL_OP3_404_13030_20141101_200709_outLine +BABEL_OP3_404_13178_20141129_192909_inLine +BABEL_OP3_404_13178_20141129_192909_outLine +BABEL_OP3_404_13324_20141022_200257_inLine +BABEL_OP3_404_13324_20141022_200257_outLine +BABEL_OP3_404_13664_20141012_013523_inLine 
+BABEL_OP3_404_13664_20141012_013523_outLine +BABEL_OP3_404_13709_20150512_015216_inLine +BABEL_OP3_404_13709_20150512_015216_outLine +BABEL_OP3_404_14137_20141025_202817_inLine +BABEL_OP3_404_14137_20141025_202817_outLine +BABEL_OP3_404_14229_20141029_200136_inLine +BABEL_OP3_404_14229_20141029_200136_outLine +BABEL_OP3_404_14237_20141006_171921_inLine +BABEL_OP3_404_14237_20141006_171921_outLine +BABEL_OP3_404_14440_20141127_213106_inLine +BABEL_OP3_404_14440_20141127_213106_outLine +BABEL_OP3_404_14807_20141110_231934_inLine +BABEL_OP3_404_14807_20141110_231934_outLine +BABEL_OP3_404_14875_20141026_230227_inLine +BABEL_OP3_404_14875_20141026_230227_outLine +BABEL_OP3_404_14899_20141022_202217_inLine +BABEL_OP3_404_14899_20141022_202217_outLine +BABEL_OP3_404_14929_20141129_192841_inLine +BABEL_OP3_404_14929_20141129_192841_outLine +BABEL_OP3_404_15382_20141130_213942_inLine +BABEL_OP3_404_15382_20141130_213942_outLine +BABEL_OP3_404_15848_20141006_231138_inLine +BABEL_OP3_404_15848_20141006_231138_outLine +BABEL_OP3_404_15869_20150218_225936_inLine +BABEL_OP3_404_15869_20150218_225936_outLine +BABEL_OP3_404_16149_20141010_173548_inLine +BABEL_OP3_404_16149_20141010_173548_outLine +BABEL_OP3_404_16467_20141130_014316_inLine +BABEL_OP3_404_16467_20141130_014316_outLine +BABEL_OP3_404_16467_20141130_015010_inLine +BABEL_OP3_404_16467_20141130_015010_outLine +BABEL_OP3_404_17113_20150611_050102_inLine +BABEL_OP3_404_17113_20150611_050102_outLine +BABEL_OP3_404_17280_20141103_190330_inLine +BABEL_OP3_404_17280_20141103_190330_outLine +BABEL_OP3_404_17615_20141201_025917_inLine +BABEL_OP3_404_17615_20141201_025917_outLine +BABEL_OP3_404_19134_20141120_053128_inLine +BABEL_OP3_404_19134_20141120_053128_outLine +BABEL_OP3_404_19703_20141027_004315_inLine +BABEL_OP3_404_19703_20141027_004315_outLine +BABEL_OP3_404_20133_20141010_195231_inLine +BABEL_OP3_404_20133_20141010_195231_outLine +BABEL_OP3_404_20985_20141126_183236_inLine +BABEL_OP3_404_20985_20141126_183236_outLine +BABEL_OP3_404_21004_20141201_035831_inLine +BABEL_OP3_404_21004_20141201_035831_outLine +BABEL_OP3_404_22280_20141111_020522_inLine +BABEL_OP3_404_22280_20141111_020522_outLine +BABEL_OP3_404_23046_20141031_030755_inLine +BABEL_OP3_404_23046_20141031_030755_outLine +BABEL_OP3_404_23505_20141021_032033_inLine +BABEL_OP3_404_23505_20141021_032033_outLine +BABEL_OP3_404_23731_20141130_033602_inLine +BABEL_OP3_404_23731_20141130_033602_outLine +BABEL_OP3_404_23980_20141106_225951_inLine +BABEL_OP3_404_23980_20141106_225951_outLine +BABEL_OP3_404_24270_20141111_012902_inLine +BABEL_OP3_404_24270_20141111_012902_outLine +BABEL_OP3_404_24470_20141111_184651_inLine +BABEL_OP3_404_24470_20141111_184651_outLine +BABEL_OP3_404_24470_20141111_190229_inLine +BABEL_OP3_404_24470_20141111_190229_outLine +BABEL_OP3_404_24532_20141007_211325_inLine +BABEL_OP3_404_24532_20141007_211325_outLine +BABEL_OP3_404_24589_20141031_020641_inLine +BABEL_OP3_404_24589_20141031_020641_outLine +BABEL_OP3_404_24679_20141018_015615_inLine +BABEL_OP3_404_24679_20141018_015615_outLine +BABEL_OP3_404_24982_20141102_021352_inLine +BABEL_OP3_404_24982_20141102_021352_outLine +BABEL_OP3_404_26388_20141026_014207_inLine +BABEL_OP3_404_26388_20141026_014207_outLine +BABEL_OP3_404_27042_20141201_215107_inLine +BABEL_OP3_404_27042_20141201_215107_outLine +BABEL_OP3_404_28303_20141028_182204_inLine +BABEL_OP3_404_28303_20141028_182204_outLine +BABEL_OP3_404_28522_20141124_222758_inLine +BABEL_OP3_404_28522_20141124_222758_outLine 
+BABEL_OP3_404_28538_20141119_005526_inLine +BABEL_OP3_404_28538_20141119_005526_outLine +BABEL_OP3_404_28871_20141019_181913_inLine +BABEL_OP3_404_28871_20141019_181913_outLine +BABEL_OP3_404_29039_20141128_035839_inLine +BABEL_OP3_404_29039_20141128_035839_outLine +BABEL_OP3_404_29208_20141106_013309_inLine +BABEL_OP3_404_29208_20141106_013309_outLine +BABEL_OP3_404_30098_20150610_150504_inLine +BABEL_OP3_404_30098_20150610_150504_outLine +BABEL_OP3_404_30432_20141126_052839_inLine +BABEL_OP3_404_30432_20141126_052839_outLine +BABEL_OP3_404_30461_20150620_020316_inLine +BABEL_OP3_404_30461_20150620_020316_outLine +BABEL_OP3_404_31624_20141105_214349_inLine +BABEL_OP3_404_31624_20141105_214349_outLine +BABEL_OP3_404_31979_20141106_000523_inLine +BABEL_OP3_404_31979_20141106_000523_outLine +BABEL_OP3_404_31992_20141014_221817_inLine +BABEL_OP3_404_31992_20141014_221817_outLine +BABEL_OP3_404_32122_20141115_022841_inLine +BABEL_OP3_404_32122_20141115_022841_outLine +BABEL_OP3_404_32287_20150210_060823_inLine +BABEL_OP3_404_32287_20150210_060823_outLine +BABEL_OP3_404_32708_20141106_032826_inLine +BABEL_OP3_404_32708_20141106_032826_outLine +BABEL_OP3_404_32727_20141128_203500_inLine +BABEL_OP3_404_32727_20141128_203500_outLine +BABEL_OP3_404_32727_20141128_204751_inLine +BABEL_OP3_404_32727_20141128_204751_outLine +BABEL_OP3_404_33355_20141019_032024_inLine +BABEL_OP3_404_33355_20141019_032024_outLine +BABEL_OP3_404_33355_20141019_034109_inLine +BABEL_OP3_404_33355_20141019_034109_outLine +BABEL_OP3_404_33704_20141207_073436_inLine +BABEL_OP3_404_33704_20141207_073436_outLine +BABEL_OP3_404_34679_20141102_052808_inLine +BABEL_OP3_404_34679_20141102_052808_outLine +BABEL_OP3_404_34688_20141009_073303_inLine +BABEL_OP3_404_34688_20141009_073303_outLine +BABEL_OP3_404_35143_20141130_181111_inLine +BABEL_OP3_404_35143_20141130_181111_outLine +BABEL_OP3_404_37064_20141102_063308_inLine +BABEL_OP3_404_37064_20141102_063308_outLine +BABEL_OP3_404_37281_20141119_053453_inLine +BABEL_OP3_404_37281_20141119_053453_outLine +BABEL_OP3_404_37598_20141119_045926_inLine +BABEL_OP3_404_37598_20141119_045926_outLine +BABEL_OP3_404_37682_20141101_221445_inLine +BABEL_OP3_404_37682_20141101_221445_outLine +BABEL_OP3_404_37853_20150602_030625_inLine +BABEL_OP3_404_37853_20150602_030625_outLine +BABEL_OP3_404_38588_20141118_163844_inLine +BABEL_OP3_404_38588_20141118_163844_outLine +BABEL_OP3_404_40557_20141127_200639_inLine +BABEL_OP3_404_40557_20141127_200639_outLine +BABEL_OP3_404_40713_20141028_221207_inLine +BABEL_OP3_404_40713_20141028_221207_outLine +BABEL_OP3_404_40939_20150210_212748_inLine +BABEL_OP3_404_40939_20150210_212748_outLine +BABEL_OP3_404_41100_20141021_022126_inLine +BABEL_OP3_404_41100_20141021_022126_outLine +BABEL_OP3_404_41609_20141009_013405_inLine +BABEL_OP3_404_41609_20141009_013405_outLine +BABEL_OP3_404_41680_20141012_040411_inLine +BABEL_OP3_404_41680_20141012_040411_outLine +BABEL_OP3_404_41920_20141008_040539_inLine +BABEL_OP3_404_41920_20141008_040539_outLine +BABEL_OP3_404_41958_20141029_212755_inLine +BABEL_OP3_404_41958_20141029_212755_outLine +BABEL_OP3_404_42877_20150212_052937_inLine +BABEL_OP3_404_42877_20150212_052937_outLine +BABEL_OP3_404_43368_20141031_010629_inLine +BABEL_OP3_404_43368_20141031_010629_outLine +BABEL_OP3_404_44114_20150614_012319_inLine +BABEL_OP3_404_44114_20150614_012319_outLine +BABEL_OP3_404_44477_20141201_180604_inLine +BABEL_OP3_404_44477_20141201_180604_outLine +BABEL_OP3_404_44847_20141130_221248_inLine 
+BABEL_OP3_404_44847_20141130_221248_outLine +BABEL_OP3_404_45121_20150609_055234_inLine +BABEL_OP3_404_45121_20150609_055234_outLine +BABEL_OP3_404_45560_20141012_030417_inLine +BABEL_OP3_404_45560_20141012_030417_outLine +BABEL_OP3_404_46169_20141130_224339_inLine +BABEL_OP3_404_46169_20141130_224339_outLine +BABEL_OP3_404_46268_20141019_032022_inLine +BABEL_OP3_404_46268_20141019_032022_outLine +BABEL_OP3_404_46550_20141105_072519_inLine +BABEL_OP3_404_46550_20141105_072519_outLine +BABEL_OP3_404_46625_20141011_040505_inLine +BABEL_OP3_404_46625_20141011_040505_outLine +BABEL_OP3_404_46681_20141021_040451_inLine +BABEL_OP3_404_46681_20141021_040451_outLine +BABEL_OP3_404_46881_20141012_020055_inLine +BABEL_OP3_404_46881_20141012_020055_outLine +BABEL_OP3_404_46976_20141107_183806_inLine +BABEL_OP3_404_46976_20141107_183806_outLine +BABEL_OP3_404_47270_20150512_053415_inLine +BABEL_OP3_404_47270_20150512_053415_outLine +BABEL_OP3_404_47802_20141110_200430_inLine +BABEL_OP3_404_47802_20141110_200430_outLine +BABEL_OP3_404_48243_20141023_200903_inLine +BABEL_OP3_404_48243_20141023_200903_outLine +BABEL_OP3_404_48844_20141020_065414_inLine +BABEL_OP3_404_48844_20141020_065414_outLine +BABEL_OP3_404_49197_20141117_024730_inLine +BABEL_OP3_404_49197_20141117_024730_outLine +BABEL_OP3_404_49768_20141026_022902_inLine +BABEL_OP3_404_49768_20141026_022902_outLine +BABEL_OP3_404_49902_20141101_175534_inLine +BABEL_OP3_404_49902_20141101_175534_outLine +BABEL_OP3_404_49907_20141103_050534_inLine +BABEL_OP3_404_49907_20141103_050534_outLine +BABEL_OP3_404_50175_20141021_025726_inLine +BABEL_OP3_404_50175_20141021_025726_outLine +BABEL_OP3_404_50745_20150513_162805_inLine +BABEL_OP3_404_50745_20150513_162805_outLine +BABEL_OP3_404_51015_20141123_193824_inLine +BABEL_OP3_404_51015_20141123_193824_outLine +BABEL_OP3_404_52246_20141118_035022_inLine +BABEL_OP3_404_52246_20141118_035022_outLine +BABEL_OP3_404_52246_20141118_040850_inLine +BABEL_OP3_404_52246_20141118_040850_outLine +BABEL_OP3_404_52301_20141009_051739_inLine +BABEL_OP3_404_52301_20141009_051739_outLine +BABEL_OP3_404_52301_20141009_054049_inLine +BABEL_OP3_404_52301_20141009_054049_outLine +BABEL_OP3_404_52490_20141016_020323_inLine +BABEL_OP3_404_52490_20141016_020323_outLine +BABEL_OP3_404_52725_20150522_222730_inLine +BABEL_OP3_404_52725_20150522_222730_outLine +BABEL_OP3_404_54104_20141008_214620_inLine +BABEL_OP3_404_54104_20141008_214620_outLine +BABEL_OP3_404_54160_20141009_180704_inLine +BABEL_OP3_404_54160_20141009_180704_outLine +BABEL_OP3_404_54160_20141009_184719_inLine +BABEL_OP3_404_54160_20141009_184719_outLine +BABEL_OP3_404_54160_20141009_185557_inLine +BABEL_OP3_404_54160_20141009_185557_outLine +BABEL_OP3_404_54405_20141117_054820_inLine +BABEL_OP3_404_54405_20141117_054820_outLine +BABEL_OP3_404_54744_20141015_012011_inLine +BABEL_OP3_404_54744_20141015_012011_outLine +BABEL_OP3_404_55259_20141029_225631_inLine +BABEL_OP3_404_55259_20141029_225631_outLine +BABEL_OP3_404_56213_20141201_000837_inLine +BABEL_OP3_404_56213_20141201_000837_outLine +BABEL_OP3_404_57654_20141023_235628_inLine +BABEL_OP3_404_57654_20141023_235628_outLine +BABEL_OP3_404_57678_20141104_023128_inLine +BABEL_OP3_404_57678_20141104_023128_outLine +BABEL_OP3_404_57919_20150127_041057_inLine +BABEL_OP3_404_57919_20150127_041057_outLine +BABEL_OP3_404_58103_20141030_002209_inLine +BABEL_OP3_404_58103_20141030_002209_outLine +BABEL_OP3_404_59078_20141111_004941_inLine +BABEL_OP3_404_59078_20141111_004941_outLine 
+BABEL_OP3_404_59262_20141130_212633_inLine +BABEL_OP3_404_59262_20141130_212633_outLine +BABEL_OP3_404_59720_20141029_204612_inLine +BABEL_OP3_404_59720_20141029_204612_outLine +BABEL_OP3_404_60026_20141008_051633_inLine +BABEL_OP3_404_60026_20141008_051633_outLine +BABEL_OP3_404_60474_20141029_182816_inLine +BABEL_OP3_404_60474_20141029_182816_outLine +BABEL_OP3_404_60626_20141028_212539_inLine +BABEL_OP3_404_60626_20141028_212539_outLine +BABEL_OP3_404_61167_20141030_222711_inLine +BABEL_OP3_404_61167_20141030_222711_outLine +BABEL_OP3_404_61219_20141025_193634_inLine +BABEL_OP3_404_61219_20141025_193634_outLine +BABEL_OP3_404_61225_20141009_174003_inLine +BABEL_OP3_404_61225_20141009_174003_outLine +BABEL_OP3_404_61678_20141019_201928_inLine +BABEL_OP3_404_61678_20141019_201928_outLine +BABEL_OP3_404_61873_20141108_214852_inLine +BABEL_OP3_404_61873_20141108_214852_outLine +BABEL_OP3_404_62155_20150522_032307_inLine +BABEL_OP3_404_62155_20150522_032307_outLine +BABEL_OP3_404_62286_20141105_204359_inLine +BABEL_OP3_404_62286_20141105_204359_outLine +BABEL_OP3_404_62456_20141108_202333_inLine +BABEL_OP3_404_62456_20141108_202333_outLine +BABEL_OP3_404_62714_20150522_011337_inLine +BABEL_OP3_404_62714_20150522_011337_outLine +BABEL_OP3_404_62734_20141029_221513_inLine +BABEL_OP3_404_62734_20141029_221513_outLine +BABEL_OP3_404_63081_20141021_032233_inLine +BABEL_OP3_404_63081_20141021_032233_outLine +BABEL_OP3_404_63081_20141021_033457_inLine +BABEL_OP3_404_63081_20141021_033457_outLine +BABEL_OP3_404_63084_20141130_221452_inLine +BABEL_OP3_404_63084_20141130_221452_outLine +BABEL_OP3_404_63220_20141127_033605_inLine +BABEL_OP3_404_63220_20141127_033605_outLine +BABEL_OP3_404_63757_20141111_180721_inLine +BABEL_OP3_404_63757_20141111_180721_outLine +BABEL_OP3_404_64494_20141026_203549_inLine +BABEL_OP3_404_64494_20141026_203549_outLine +BABEL_OP3_404_64768_20141027_201818_inLine +BABEL_OP3_404_64768_20141027_201818_outLine +BABEL_OP3_404_64870_20141108_192546_inLine +BABEL_OP3_404_64870_20141108_192546_outLine +BABEL_OP3_404_66045_20141117_035937_inLine +BABEL_OP3_404_66045_20141117_035937_outLine +BABEL_OP3_404_66177_20150503_202932_inLine +BABEL_OP3_404_66177_20150503_202932_outLine +BABEL_OP3_404_66822_20141117_020953_inLine +BABEL_OP3_404_66822_20141117_020953_outLine +BABEL_OP3_404_66916_20141022_000731_inLine +BABEL_OP3_404_66916_20141022_000731_outLine +BABEL_OP3_404_67401_20141109_211809_inLine +BABEL_OP3_404_67401_20141109_211809_outLine +BABEL_OP3_404_67842_20141104_051753_inLine +BABEL_OP3_404_67842_20141104_051753_outLine +BABEL_OP3_404_68059_20141109_052011_inLine +BABEL_OP3_404_68059_20141109_052011_outLine +BABEL_OP3_404_68068_20141201_054518_inLine +BABEL_OP3_404_68068_20141201_054518_outLine +BABEL_OP3_404_68244_20141119_065540_inLine +BABEL_OP3_404_68244_20141119_065540_outLine +BABEL_OP3_404_68384_20141130_035214_inLine +BABEL_OP3_404_68384_20141130_035214_outLine +BABEL_OP3_404_68385_20141017_031005_inLine +BABEL_OP3_404_68385_20141017_031005_outLine +BABEL_OP3_404_68627_20141105_190511_inLine +BABEL_OP3_404_68627_20141105_190511_outLine +BABEL_OP3_404_68823_20150212_041147_inLine +BABEL_OP3_404_68823_20150212_041147_outLine +BABEL_OP3_404_69107_20141120_010459_inLine +BABEL_OP3_404_69107_20141120_010459_outLine +BABEL_OP3_404_69574_20141006_023156_inLine +BABEL_OP3_404_69574_20141006_023156_outLine +BABEL_OP3_404_69578_20141117_003921_inLine +BABEL_OP3_404_69578_20141117_003921_outLine +BABEL_OP3_404_70121_20141104_202610_inLine 
+BABEL_OP3_404_70121_20141104_202610_outLine +BABEL_OP3_404_70282_20141111_000251_inLine +BABEL_OP3_404_70282_20141111_000251_outLine +BABEL_OP3_404_70794_20141021_185105_inLine +BABEL_OP3_404_70794_20141021_185105_outLine +BABEL_OP3_404_71263_20141119_234747_inLine +BABEL_OP3_404_71263_20141119_234747_outLine +BABEL_OP3_404_71401_20150206_070446_inLine +BABEL_OP3_404_71401_20150206_070446_outLine +BABEL_OP3_404_71404_20141023_215509_inLine +BABEL_OP3_404_71404_20141023_215509_outLine +BABEL_OP3_404_71566_20141130_035713_inLine +BABEL_OP3_404_71566_20141130_035713_outLine +BABEL_OP3_404_71566_20141130_040359_inLine +BABEL_OP3_404_71566_20141130_040359_outLine +BABEL_OP3_404_72844_20141007_033837_inLine +BABEL_OP3_404_72844_20141007_033837_outLine +BABEL_OP3_404_73119_20141026_232203_inLine +BABEL_OP3_404_73119_20141026_232203_outLine +BABEL_OP3_404_73485_20150512_234636_inLine +BABEL_OP3_404_73485_20150512_234636_outLine +BABEL_OP3_404_73837_20141026_191037_inLine +BABEL_OP3_404_73837_20141026_191037_outLine +BABEL_OP3_404_74641_20141108_223951_inLine +BABEL_OP3_404_74641_20141108_223951_outLine +BABEL_OP3_404_74799_20141109_222638_inLine +BABEL_OP3_404_74799_20141109_222638_outLine +BABEL_OP3_404_75869_20150527_230650_inLine +BABEL_OP3_404_75869_20150527_230650_outLine +BABEL_OP3_404_76437_20141019_202715_inLine +BABEL_OP3_404_76437_20141019_202715_outLine +BABEL_OP3_404_77126_20141022_202348_inLine +BABEL_OP3_404_77126_20141022_202348_outLine +BABEL_OP3_404_77391_20141026_222314_inLine +BABEL_OP3_404_77391_20141026_222314_outLine +BABEL_OP3_404_77427_20141030_192713_inLine +BABEL_OP3_404_77427_20141030_192713_outLine +BABEL_OP3_404_77730_20141014_201059_inLine +BABEL_OP3_404_77730_20141014_201059_outLine +BABEL_OP3_404_77990_20141024_215822_inLine +BABEL_OP3_404_77990_20141024_215822_outLine +BABEL_OP3_404_78016_20141029_233059_inLine +BABEL_OP3_404_78016_20141029_233059_outLine +BABEL_OP3_404_78254_20141025_202742_inLine +BABEL_OP3_404_78254_20141025_202742_outLine +BABEL_OP3_404_78254_20141025_204922_inLine +BABEL_OP3_404_78254_20141025_204922_outLine +BABEL_OP3_404_78511_20141201_003606_inLine +BABEL_OP3_404_78511_20141201_003606_outLine +BABEL_OP3_404_78976_20141025_183704_inLine +BABEL_OP3_404_78976_20141025_183704_outLine +BABEL_OP3_404_79139_20141117_054733_inLine +BABEL_OP3_404_79139_20141117_054733_outLine +BABEL_OP3_404_79751_20141101_232250_inLine +BABEL_OP3_404_79751_20141101_232250_outLine +BABEL_OP3_404_80439_20141026_005410_inLine +BABEL_OP3_404_80439_20141026_005410_outLine +BABEL_OP3_404_81213_20141102_205052_inLine +BABEL_OP3_404_81213_20141102_205052_outLine +BABEL_OP3_404_81229_20141117_041745_inLine +BABEL_OP3_404_81229_20141117_041745_outLine +BABEL_OP3_404_81971_20141022_025641_inLine +BABEL_OP3_404_81971_20141022_025641_outLine +BABEL_OP3_404_82089_20141117_045302_inLine +BABEL_OP3_404_82089_20141117_045302_outLine +BABEL_OP3_404_82303_20150614_024236_inLine +BABEL_OP3_404_82303_20150614_024236_outLine +BABEL_OP3_404_82473_20141026_060037_inLine +BABEL_OP3_404_82473_20141026_060037_outLine +BABEL_OP3_404_82637_20141021_010105_inLine +BABEL_OP3_404_82637_20141021_010105_outLine +BABEL_OP3_404_82742_20141201_234306_inLine +BABEL_OP3_404_82742_20141201_234306_outLine +BABEL_OP3_404_83062_20150523_220236_inLine +BABEL_OP3_404_83062_20150523_220236_outLine +BABEL_OP3_404_83238_20141119_180953_inLine +BABEL_OP3_404_83238_20141119_180953_outLine +BABEL_OP3_404_83366_20141120_192208_inLine +BABEL_OP3_404_83366_20141120_192208_outLine 
+BABEL_OP3_404_83775_20141030_230742_inLine +BABEL_OP3_404_83775_20141030_230742_outLine +BABEL_OP3_404_83851_20141028_203735_inLine +BABEL_OP3_404_83851_20141028_203735_outLine +BABEL_OP3_404_83929_20141018_184023_inLine +BABEL_OP3_404_83929_20141018_184023_outLine +BABEL_OP3_404_84055_20150504_002015_inLine +BABEL_OP3_404_84055_20150504_002015_outLine +BABEL_OP3_404_84061_20141030_205021_inLine +BABEL_OP3_404_84061_20141030_205021_outLine +BABEL_OP3_404_84339_20150502_014143_inLine +BABEL_OP3_404_84339_20150502_014143_outLine +BABEL_OP3_404_85048_20141127_023704_inLine +BABEL_OP3_404_85048_20141127_023704_outLine +BABEL_OP3_404_85254_20150620_035606_inLine +BABEL_OP3_404_85254_20150620_035606_outLine +BABEL_OP3_404_85322_20141008_235518_inLine +BABEL_OP3_404_85322_20141008_235518_outLine +BABEL_OP3_404_85651_20141211_032650_inLine +BABEL_OP3_404_85651_20141211_032650_outLine +BABEL_OP3_404_86191_20141027_013544_inLine +BABEL_OP3_404_86191_20141027_013544_outLine +BABEL_OP3_404_86472_20141201_011325_inLine +BABEL_OP3_404_86472_20141201_011325_outLine +BABEL_OP3_404_86635_20141127_204158_inLine +BABEL_OP3_404_86635_20141127_204158_outLine +BABEL_OP3_404_86722_20141029_192140_inLine +BABEL_OP3_404_86722_20141029_192140_outLine +BABEL_OP3_404_86888_20141119_022459_inLine +BABEL_OP3_404_86888_20141119_022459_outLine +BABEL_OP3_404_87470_20141114_214639_inLine +BABEL_OP3_404_87470_20141114_214639_outLine +BABEL_OP3_404_87629_20141127_020403_inLine +BABEL_OP3_404_87629_20141127_020403_outLine +BABEL_OP3_404_88260_20141103_234824_inLine +BABEL_OP3_404_88260_20141103_234824_outLine +BABEL_OP3_404_88445_20141119_043713_inLine +BABEL_OP3_404_88445_20141119_043713_outLine +BABEL_OP3_404_88661_20141127_025208_inLine +BABEL_OP3_404_88661_20141127_025208_outLine +BABEL_OP3_404_88669_20141119_000147_inLine +BABEL_OP3_404_88669_20141119_000147_outLine +BABEL_OP3_404_88783_20141201_045305_inLine +BABEL_OP3_404_88783_20141201_045305_outLine +BABEL_OP3_404_89045_20141022_193202_inLine +BABEL_OP3_404_89045_20141022_193202_outLine +BABEL_OP3_404_89372_20141010_000950_inLine +BABEL_OP3_404_89372_20141010_000950_outLine +BABEL_OP3_404_89650_20150220_222402_inLine +BABEL_OP3_404_89650_20150220_222402_outLine +BABEL_OP3_404_89650_20150220_224606_inLine +BABEL_OP3_404_89650_20150220_224606_outLine +BABEL_OP3_404_89665_20141103_202723_inLine +BABEL_OP3_404_89665_20141103_202723_outLine +BABEL_OP3_404_90930_20150119_021352_inLine +BABEL_OP3_404_90930_20150119_021352_outLine +BABEL_OP3_404_91463_20141116_023036_inLine +BABEL_OP3_404_91463_20141116_023036_outLine +BABEL_OP3_404_91825_20141009_181224_inLine +BABEL_OP3_404_91825_20141009_181224_outLine +BABEL_OP3_404_91825_20141009_183843_inLine +BABEL_OP3_404_91825_20141009_183843_outLine +BABEL_OP3_404_91971_20150217_041455_inLine +BABEL_OP3_404_91971_20150217_041455_outLine +BABEL_OP3_404_92698_20141117_072302_inLine +BABEL_OP3_404_92698_20141117_072302_outLine +BABEL_OP3_404_92736_20141201_011442_inLine +BABEL_OP3_404_92736_20141201_011442_outLine +BABEL_OP3_404_94025_20141129_180207_inLine +BABEL_OP3_404_94025_20141129_180207_outLine +BABEL_OP3_404_94869_20141007_194254_inLine +BABEL_OP3_404_94869_20141007_194254_outLine +BABEL_OP3_404_95966_20141129_060246_inLine +BABEL_OP3_404_95966_20141129_060246_outLine +BABEL_OP3_404_96376_20150503_033706_inLine +BABEL_OP3_404_96376_20150503_033706_outLine +BABEL_OP3_404_96504_20141103_031329_inLine +BABEL_OP3_404_96504_20141103_031329_outLine +BABEL_OP3_404_97461_20141118_230730_inLine 
+BABEL_OP3_404_97461_20141118_230730_outLine +BABEL_OP3_404_97557_20141119_230718_inLine +BABEL_OP3_404_97557_20141119_230718_outLine +BABEL_OP3_404_97588_20141018_234016_inLine +BABEL_OP3_404_97588_20141018_234016_outLine +BABEL_OP3_404_97588_20141018_235425_inLine +BABEL_OP3_404_97588_20141018_235425_outLine +BABEL_OP3_404_97896_20141116_221329_inLine +BABEL_OP3_404_97896_20141116_221329_outLine +BABEL_OP3_404_97988_20141201_030306_inLine +BABEL_OP3_404_97988_20141201_030306_outLine +BABEL_OP3_404_98888_20141113_212715_inLine +BABEL_OP3_404_98888_20141113_212715_outLine +BABEL_OP3_404_99202_20141108_210814_inLine +BABEL_OP3_404_99202_20141108_210814_outLine +BABEL_OP3_404_99487_20141021_053024_inLine +BABEL_OP3_404_99487_20141021_053024_outLine +BABEL_OP3_404_99594_20141105_194545_inLine +BABEL_OP3_404_99594_20141105_194545_outLine +BABEL_OP3_404_99813_20141120_025129_inLine +BABEL_OP3_404_99813_20141120_025129_outLine diff --git a/egs/babel/s5d/conf/lists/404-georgian/untranscribed-training.list b/egs/babel/s5d/conf/lists/404-georgian/untranscribed-training.list new file mode 100644 index 00000000000..8d6682cc789 --- /dev/null +++ b/egs/babel/s5d/conf/lists/404-georgian/untranscribed-training.list @@ -0,0 +1,535 @@ +BABEL_OP3_404_10058_20150526_034808_inLine +BABEL_OP3_404_10411_20150611_172027_inLine +BABEL_OP3_404_10411_20150611_172027_outLine +BABEL_OP3_404_10938_20141030_023413_inLine +BABEL_OP3_404_10938_20141030_023413_outLine +BABEL_OP3_404_11352_20150513_002642_inLine +BABEL_OP3_404_11352_20150513_002642_outLine +BABEL_OP3_404_11859_20150611_041737_inLine +BABEL_OP3_404_11859_20150611_041737_outLine +BABEL_OP3_404_12220_20141116_205911_inLine +BABEL_OP3_404_12220_20141116_205911_outLine +BABEL_OP3_404_12609_20150524_172934_inLine +BABEL_OP3_404_12609_20150524_172934_outLine +BABEL_OP3_404_13126_20150524_221540_inLine +BABEL_OP3_404_13126_20150524_221540_outLine +BABEL_OP3_404_14158_20141130_030130_inLine +BABEL_OP3_404_14158_20141130_030130_outLine +BABEL_OP3_404_15024_20141118_234824_inLine +BABEL_OP3_404_15024_20141118_234824_outLine +BABEL_OP3_404_15042_20150506_232829_inLine +BABEL_OP3_404_15042_20150506_232829_outLine +BABEL_OP3_404_15535_20141129_021659_inLine +BABEL_OP3_404_15535_20141129_021659_outLine +BABEL_OP3_404_15638_20141127_220502_outLine +BABEL_OP3_404_15902_20141020_173105_outLine +BABEL_OP3_404_16475_20141116_052010_outLine +BABEL_OP3_404_16601_20141201_041704_inLine +BABEL_OP3_404_16601_20141201_041704_outLine +BABEL_OP3_404_17320_20150524_213213_inLine +BABEL_OP3_404_17320_20150524_213213_outLine +BABEL_OP3_404_17420_20150503_201902_inLine +BABEL_OP3_404_17420_20150503_201902_outLine +BABEL_OP3_404_17420_20150527_025815_inLine +BABEL_OP3_404_17420_20150527_025815_outLine +BABEL_OP3_404_17420_20150527_034621_inLine +BABEL_OP3_404_17420_20150527_034621_outLine +BABEL_OP3_404_17520_20141113_032534_inLine +BABEL_OP3_404_17567_20141117_182919_inLine +BABEL_OP3_404_17567_20141117_182919_outLine +BABEL_OP3_404_17573_20141129_035040_inLine +BABEL_OP3_404_17573_20141129_035040_outLine +BABEL_OP3_404_17890_20141128_040046_inLine +BABEL_OP3_404_17890_20141128_040046_outLine +BABEL_OP3_404_17923_20141022_231429_outLine +BABEL_OP3_404_18118_20150503_165936_inLine +BABEL_OP3_404_18118_20150503_165936_outLine +BABEL_OP3_404_18291_20150611_062705_outLine +BABEL_OP3_404_18291_20150611_063700_outLine +BABEL_OP3_404_18766_20150610_064349_inLine +BABEL_OP3_404_19120_20150525_014657_inLine +BABEL_OP3_404_19120_20150525_014657_outLine +BABEL_OP3_404_19120_20150525_015635_inLine 
+BABEL_OP3_404_19120_20150525_015635_outLine +BABEL_OP3_404_19877_20150506_202237_outLine +BABEL_OP3_404_20454_20150218_171143_inLine +BABEL_OP3_404_20454_20150218_171143_outLine +BABEL_OP3_404_21159_20150615_021612_inLine +BABEL_OP3_404_21435_20150523_030702_inLine +BABEL_OP3_404_21435_20150523_030702_outLine +BABEL_OP3_404_21581_20141101_011021_inLine +BABEL_OP3_404_21581_20141101_011021_outLine +BABEL_OP3_404_21807_20141112_225225_outLine +BABEL_OP3_404_22591_20150217_220714_inLine +BABEL_OP3_404_24209_20150212_224614_inLine +BABEL_OP3_404_24239_20150517_203015_inLine +BABEL_OP3_404_24323_20141117_020615_outLine +BABEL_OP3_404_24501_20150522_030231_inLine +BABEL_OP3_404_24586_20150524_190657_inLine +BABEL_OP3_404_24586_20150524_190657_outLine +BABEL_OP3_404_24590_20141116_230233_inLine +BABEL_OP3_404_24590_20141116_230233_outLine +BABEL_OP3_404_25068_20150206_022730_outLine +BABEL_OP3_404_25085_20150611_040906_inLine +BABEL_OP3_404_25085_20150611_040906_outLine +BABEL_OP3_404_25412_20141120_031532_inLine +BABEL_OP3_404_25412_20141120_031532_outLine +BABEL_OP3_404_25496_20150613_034126_inLine +BABEL_OP3_404_25496_20150613_034126_outLine +BABEL_OP3_404_26398_20150527_032152_inLine +BABEL_OP3_404_26398_20150527_032152_outLine +BABEL_OP3_404_26478_20150617_004029_inLine +BABEL_OP3_404_26478_20150617_004029_outLine +BABEL_OP3_404_26836_20141102_024528_inLine +BABEL_OP3_404_26836_20141102_024528_outLine +BABEL_OP3_404_27203_20141119_185720_inLine +BABEL_OP3_404_27203_20141119_185720_outLine +BABEL_OP3_404_27203_20141119_191138_inLine +BABEL_OP3_404_27203_20141119_191138_outLine +BABEL_OP3_404_27590_20141128_051454_inLine +BABEL_OP3_404_28280_20150619_024509_inLine +BABEL_OP3_404_28280_20150619_024509_outLine +BABEL_OP3_404_28280_20150619_025848_inLine +BABEL_OP3_404_28280_20150619_025848_outLine +BABEL_OP3_404_28600_20141201_223206_inLine +BABEL_OP3_404_28600_20141201_223206_outLine +BABEL_OP3_404_28945_20141104_060349_outLine +BABEL_OP3_404_29076_20141109_215142_inLine +BABEL_OP3_404_29076_20141109_215142_outLine +BABEL_OP3_404_29230_20150611_051340_inLine +BABEL_OP3_404_29230_20150611_051340_outLine +BABEL_OP3_404_29439_20150524_201524_inLine +BABEL_OP3_404_29439_20150524_201524_outLine +BABEL_OP3_404_30497_20150525_194737_inLine +BABEL_OP3_404_30497_20150525_194737_outLine +BABEL_OP3_404_30645_20141019_220859_inLine +BABEL_OP3_404_30653_20150514_014515_inLine +BABEL_OP3_404_31267_20150615_011004_outLine +BABEL_OP3_404_31484_20141122_232804_inLine +BABEL_OP3_404_31484_20141122_232804_outLine +BABEL_OP3_404_31919_20150526_220911_inLine +BABEL_OP3_404_31919_20150526_220911_outLine +BABEL_OP3_404_32630_20150609_012137_inLine +BABEL_OP3_404_32630_20150609_012137_outLine +BABEL_OP3_404_32959_20141201_005331_inLine +BABEL_OP3_404_32959_20141201_005331_outLine +BABEL_OP3_404_32998_20141112_054111_inLine +BABEL_OP3_404_34328_20141119_054513_outLine +BABEL_OP3_404_34328_20141119_055432_outLine +BABEL_OP3_404_34811_20141109_001009_inLine +BABEL_OP3_404_34811_20141109_001009_outLine +BABEL_OP3_404_34899_20150611_060602_outLine +BABEL_OP3_404_35008_20141201_023042_inLine +BABEL_OP3_404_35008_20141201_023042_outLine +BABEL_OP3_404_35181_20150526_211416_inLine +BABEL_OP3_404_35181_20150526_211416_outLine +BABEL_OP3_404_35706_20150523_015900_inLine +BABEL_OP3_404_35706_20150523_015900_outLine +BABEL_OP3_404_35786_20150604_015518_inLine +BABEL_OP3_404_35786_20150604_015518_outLine +BABEL_OP3_404_36017_20150528_192934_inLine +BABEL_OP3_404_36017_20150528_192934_outLine 
+BABEL_OP3_404_36039_20150526_230125_inLine +BABEL_OP3_404_36039_20150526_230125_outLine +BABEL_OP3_404_36059_20150601_023254_inLine +BABEL_OP3_404_36059_20150601_023254_outLine +BABEL_OP3_404_36059_20150601_033346_inLine +BABEL_OP3_404_36059_20150601_033346_outLine +BABEL_OP3_404_36147_20150211_013803_outLine +BABEL_OP3_404_36219_20141104_012216_inLine +BABEL_OP3_404_36219_20141104_012216_outLine +BABEL_OP3_404_36642_20150610_161207_inLine +BABEL_OP3_404_36642_20150610_161207_outLine +BABEL_OP3_404_37290_20141115_050457_inLine +BABEL_OP3_404_37290_20141115_050457_outLine +BABEL_OP3_404_38125_20150526_233108_inLine +BABEL_OP3_404_38125_20150526_233108_outLine +BABEL_OP3_404_38323_20150615_021843_inLine +BABEL_OP3_404_38340_20141103_231545_inLine +BABEL_OP3_404_38340_20141103_231545_outLine +BABEL_OP3_404_38554_20141010_224451_inLine +BABEL_OP3_404_38554_20141010_224451_outLine +BABEL_OP3_404_38664_20141030_175135_inLine +BABEL_OP3_404_38664_20141030_175135_outLine +BABEL_OP3_404_38979_20150503_202406_outLine +BABEL_OP3_404_39099_20150511_053646_outLine +BABEL_OP3_404_39307_20141022_200554_inLine +BABEL_OP3_404_39307_20141022_201758_inLine +BABEL_OP3_404_39426_20150527_181901_outLine +BABEL_OP3_404_39744_20141023_002710_inLine +BABEL_OP3_404_39893_20150611_034149_inLine +BABEL_OP3_404_39920_20150503_205354_outLine +BABEL_OP3_404_41097_20141129_055801_inLine +BABEL_OP3_404_41097_20141129_055801_outLine +BABEL_OP3_404_41272_20150503_232941_inLine +BABEL_OP3_404_41334_20150617_041322_inLine +BABEL_OP3_404_41400_20150515_021408_inLine +BABEL_OP3_404_41692_20150604_005657_inLine +BABEL_OP3_404_41692_20150604_005657_outLine +BABEL_OP3_404_41745_20141114_235452_inLine +BABEL_OP3_404_41745_20141114_235452_outLine +BABEL_OP3_404_42155_20141127_055149_inLine +BABEL_OP3_404_42619_20141130_012456_outLine +BABEL_OP3_404_42834_20141125_004837_inLine +BABEL_OP3_404_42834_20141125_004837_outLine +BABEL_OP3_404_42883_20150604_035732_inLine +BABEL_OP3_404_42883_20150604_035732_outLine +BABEL_OP3_404_43388_20141114_212210_inLine +BABEL_OP3_404_43388_20141114_214120_inLine +BABEL_OP3_404_43588_20150517_233637_inLine +BABEL_OP3_404_43789_20141120_011327_outLine +BABEL_OP3_404_44309_20150525_022635_inLine +BABEL_OP3_404_44309_20150525_022635_outLine +BABEL_OP3_404_44478_20150512_225118_inLine +BABEL_OP3_404_45106_20141119_050859_inLine +BABEL_OP3_404_45106_20141119_050859_outLine +BABEL_OP3_404_45374_20150122_014830_outLine +BABEL_OP3_404_45374_20150122_015920_outLine +BABEL_OP3_404_45459_20150525_020410_inLine +BABEL_OP3_404_45459_20150525_020410_outLine +BABEL_OP3_404_45699_20150205_021829_inLine +BABEL_OP3_404_45851_20150514_155157_inLine +BABEL_OP3_404_45851_20150514_155157_outLine +BABEL_OP3_404_45908_20150515_004218_outLine +BABEL_OP3_404_46310_20141015_051100_inLine +BABEL_OP3_404_46310_20141015_051100_outLine +BABEL_OP3_404_46315_20141129_012912_inLine +BABEL_OP3_404_46315_20141129_012912_outLine +BABEL_OP3_404_46688_20141015_211329_inLine +BABEL_OP3_404_46688_20141015_211329_outLine +BABEL_OP3_404_46712_20141027_224004_inLine +BABEL_OP3_404_46712_20141027_224004_outLine +BABEL_OP3_404_46974_20141128_055136_inLine +BABEL_OP3_404_46974_20141128_055136_outLine +BABEL_OP3_404_47156_20150625_025324_inLine +BABEL_OP3_404_47156_20150625_025324_outLine +BABEL_OP3_404_47823_20141201_044425_inLine +BABEL_OP3_404_47823_20141201_044425_outLine +BABEL_OP3_404_48016_20150615_000741_inLine +BABEL_OP3_404_48016_20150615_000741_outLine +BABEL_OP3_404_48610_20141013_011505_inLine 
+BABEL_OP3_404_48610_20141013_012904_inLine +BABEL_OP3_404_48663_20150512_202837_inLine +BABEL_OP3_404_48663_20150512_202837_outLine +BABEL_OP3_404_49306_20150524_003356_inLine +BABEL_OP3_404_49306_20150524_003356_outLine +BABEL_OP3_404_49630_20141128_020114_inLine +BABEL_OP3_404_49630_20141128_020114_outLine +BABEL_OP3_404_49767_20150613_050113_inLine +BABEL_OP3_404_49767_20150613_050113_outLine +BABEL_OP3_404_49775_20141011_005306_inLine +BABEL_OP3_404_49775_20141011_005306_outLine +BABEL_OP3_404_49945_20150610_154709_inLine +BABEL_OP3_404_50601_20141127_032527_inLine +BABEL_OP3_404_50601_20141127_032527_outLine +BABEL_OP3_404_50779_20141115_012852_inLine +BABEL_OP3_404_50779_20141115_012852_outLine +BABEL_OP3_404_50810_20141007_234432_inLine +BABEL_OP3_404_50810_20141007_234432_outLine +BABEL_OP3_404_51414_20150604_001601_inLine +BABEL_OP3_404_51414_20150604_001601_outLine +BABEL_OP3_404_51484_20141202_000325_inLine +BABEL_OP3_404_51484_20141202_000325_outLine +BABEL_OP3_404_51701_20150620_010924_outLine +BABEL_OP3_404_52070_20150620_014422_outLine +BABEL_OP3_404_52070_20150620_020559_outLine +BABEL_OP3_404_52404_20141125_004855_inLine +BABEL_OP3_404_52404_20141125_004855_outLine +BABEL_OP3_404_53063_20141201_005237_inLine +BABEL_OP3_404_53063_20141201_005237_outLine +BABEL_OP3_404_53072_20150518_015132_inLine +BABEL_OP3_404_53415_20150503_225920_inLine +BABEL_OP3_404_53415_20150503_225920_outLine +BABEL_OP3_404_53492_20150525_055025_inLine +BABEL_OP3_404_53492_20150525_055025_outLine +BABEL_OP3_404_53665_20150526_004549_inLine +BABEL_OP3_404_53917_20150503_205456_outLine +BABEL_OP3_404_53957_20141201_051933_inLine +BABEL_OP3_404_54477_20141211_033627_inLine +BABEL_OP3_404_54477_20141211_033627_outLine +BABEL_OP3_404_55013_20150525_222257_inLine +BABEL_OP3_404_55013_20150525_222257_outLine +BABEL_OP3_404_55267_20141130_212756_inLine +BABEL_OP3_404_55349_20150523_031602_inLine +BABEL_OP3_404_55349_20150523_031602_outLine +BABEL_OP3_404_56019_20150502_020750_inLine +BABEL_OP3_404_56019_20150502_020750_outLine +BABEL_OP3_404_56076_20150516_164959_inLine +BABEL_OP3_404_56076_20150516_164959_outLine +BABEL_OP3_404_56331_20150526_020747_inLine +BABEL_OP3_404_56331_20150526_020747_outLine +BABEL_OP3_404_56743_20141114_223719_inLine +BABEL_OP3_404_56743_20141114_223719_outLine +BABEL_OP3_404_57065_20141201_002920_inLine +BABEL_OP3_404_57219_20150618_045613_inLine +BABEL_OP3_404_57219_20150618_045613_outLine +BABEL_OP3_404_57464_20150523_224617_inLine +BABEL_OP3_404_57542_20150526_233832_inLine +BABEL_OP3_404_57542_20150526_233832_outLine +BABEL_OP3_404_57542_20150526_235003_inLine +BABEL_OP3_404_57542_20150526_235003_outLine +BABEL_OP3_404_58006_20150526_024205_inLine +BABEL_OP3_404_58006_20150526_024205_outLine +BABEL_OP3_404_58026_20150615_004130_inLine +BABEL_OP3_404_58026_20150615_004130_outLine +BABEL_OP3_404_58915_20150611_034220_outLine +BABEL_OP3_404_59307_20150504_003405_inLine +BABEL_OP3_404_59307_20150504_003405_outLine +BABEL_OP3_404_59864_20150602_014458_inLine +BABEL_OP3_404_60299_20150611_040929_inLine +BABEL_OP3_404_60310_20141130_231532_inLine +BABEL_OP3_404_60310_20141130_231532_outLine +BABEL_OP3_404_60352_20141201_060712_inLine +BABEL_OP3_404_60352_20141201_060712_outLine +BABEL_OP3_404_60352_20141201_061821_inLine +BABEL_OP3_404_60352_20141201_061821_outLine +BABEL_OP3_404_60458_20150609_021527_inLine +BABEL_OP3_404_60458_20150609_021527_outLine +BABEL_OP3_404_60477_20150613_223056_inLine +BABEL_OP3_404_60477_20150613_224002_inLine 
+BABEL_OP3_404_60498_20150606_022221_inLine +BABEL_OP3_404_60498_20150606_022221_outLine +BABEL_OP3_404_60706_20141020_215729_inLine +BABEL_OP3_404_60706_20141020_215729_outLine +BABEL_OP3_404_61888_20150504_171019_inLine +BABEL_OP3_404_61971_20150525_020101_outLine +BABEL_OP3_404_62360_20150517_033230_inLine +BABEL_OP3_404_62360_20150517_033230_outLine +BABEL_OP3_404_62724_20141130_200827_inLine +BABEL_OP3_404_62724_20141130_200827_outLine +BABEL_OP3_404_62852_20141013_054854_outLine +BABEL_OP3_404_63425_20141126_054504_inLine +BABEL_OP3_404_63481_20141020_221014_outLine +BABEL_OP3_404_63481_20141020_224225_outLine +BABEL_OP3_404_63670_20141130_050318_inLine +BABEL_OP3_404_63670_20141130_050318_outLine +BABEL_OP3_404_63906_20150525_050310_inLine +BABEL_OP3_404_63906_20150525_050310_outLine +BABEL_OP3_404_63999_20150610_041309_inLine +BABEL_OP3_404_64014_20150503_032745_inLine +BABEL_OP3_404_64014_20150503_032745_outLine +BABEL_OP3_404_64722_20150514_034208_outLine +BABEL_OP3_404_64759_20141014_044027_inLine +BABEL_OP3_404_64759_20141014_045519_inLine +BABEL_OP3_404_64796_20141022_055826_inLine +BABEL_OP3_404_65561_20141124_060558_inLine +BABEL_OP3_404_65561_20141124_060558_outLine +BABEL_OP3_404_65640_20150528_211835_inLine +BABEL_OP3_404_65640_20150528_211835_outLine +BABEL_OP3_404_66967_20141008_202611_inLine +BABEL_OP3_404_66967_20141008_202611_outLine +BABEL_OP3_404_67152_20150503_201836_inLine +BABEL_OP3_404_67152_20150503_201836_outLine +BABEL_OP3_404_67304_20150211_054416_inLine +BABEL_OP3_404_67304_20150211_054416_outLine +BABEL_OP3_404_67552_20141126_011955_inLine +BABEL_OP3_404_67552_20141126_011955_outLine +BABEL_OP3_404_68306_20141126_180315_inLine +BABEL_OP3_404_68306_20141126_180315_outLine +BABEL_OP3_404_69096_20150512_165126_inLine +BABEL_OP3_404_69096_20150512_165126_outLine +BABEL_OP3_404_69153_20141130_221412_inLine +BABEL_OP3_404_69153_20141130_221412_outLine +BABEL_OP3_404_69153_20141130_222842_inLine +BABEL_OP3_404_69153_20141130_222842_outLine +BABEL_OP3_404_69474_20141128_051323_outLine +BABEL_OP3_404_69633_20141129_051648_inLine +BABEL_OP3_404_69633_20141129_051648_outLine +BABEL_OP3_404_69636_20141126_061322_inLine +BABEL_OP3_404_69636_20141126_061322_outLine +BABEL_OP3_404_69885_20150503_011226_inLine +BABEL_OP3_404_69885_20150503_011226_outLine +BABEL_OP3_404_69937_20150620_015912_inLine +BABEL_OP3_404_69964_20150524_015556_inLine +BABEL_OP3_404_69964_20150524_015556_outLine +BABEL_OP3_404_69982_20150625_035440_outLine +BABEL_OP3_404_70221_20141124_052004_inLine +BABEL_OP3_404_70221_20141124_052004_outLine +BABEL_OP3_404_70460_20150527_015340_inLine +BABEL_OP3_404_70460_20150527_015340_outLine +BABEL_OP3_404_70526_20150501_015444_inLine +BABEL_OP3_404_70526_20150501_015444_outLine +BABEL_OP3_404_70713_20150527_013058_inLine +BABEL_OP3_404_70713_20150527_013058_outLine +BABEL_OP3_404_71189_20150523_005918_inLine +BABEL_OP3_404_71189_20150523_005918_outLine +BABEL_OP3_404_71278_20150211_052730_inLine +BABEL_OP3_404_71278_20150211_052730_outLine +BABEL_OP3_404_71278_20150211_054040_inLine +BABEL_OP3_404_71278_20150211_054040_outLine +BABEL_OP3_404_71333_20141102_023503_inLine +BABEL_OP3_404_71333_20141102_023503_outLine +BABEL_OP3_404_71460_20150206_015309_outLine +BABEL_OP3_404_71559_20141210_220929_outLine +BABEL_OP3_404_71780_20141105_055543_inLine +BABEL_OP3_404_71780_20141105_055543_outLine +BABEL_OP3_404_72319_20150502_041426_inLine +BABEL_OP3_404_72319_20150502_041426_outLine +BABEL_OP3_404_72733_20150515_044419_inLine 
+BABEL_OP3_404_72733_20150515_044419_outLine +BABEL_OP3_404_73072_20141012_012029_inLine +BABEL_OP3_404_73072_20141012_012029_outLine +BABEL_OP3_404_73258_20141117_010123_inLine +BABEL_OP3_404_73258_20141117_010123_outLine +BABEL_OP3_404_73964_20150512_205010_inLine +BABEL_OP3_404_73964_20150512_205010_outLine +BABEL_OP3_404_74728_20150503_042547_inLine +BABEL_OP3_404_74728_20150503_042547_outLine +BABEL_OP3_404_75465_20141129_223330_outLine +BABEL_OP3_404_75975_20150127_051140_outLine +BABEL_OP3_404_76126_20141201_202238_inLine +BABEL_OP3_404_76126_20141201_202238_outLine +BABEL_OP3_404_76238_20141129_223455_inLine +BABEL_OP3_404_76238_20141129_223455_outLine +BABEL_OP3_404_76372_20150601_014341_inLine +BABEL_OP3_404_76372_20150601_014341_outLine +BABEL_OP3_404_76444_20141127_032124_inLine +BABEL_OP3_404_76444_20141127_032124_outLine +BABEL_OP3_404_76482_20150618_063131_outLine +BABEL_OP3_404_76683_20141110_191551_inLine +BABEL_OP3_404_76683_20141110_191551_outLine +BABEL_OP3_404_76837_20150124_222250_outLine +BABEL_OP3_404_76970_20150625_191722_inLine +BABEL_OP3_404_77146_20141019_060916_inLine +BABEL_OP3_404_77242_20150612_024655_inLine +BABEL_OP3_404_77567_20141021_021210_inLine +BABEL_OP3_404_77567_20141021_021210_outLine +BABEL_OP3_404_77803_20141020_030844_inLine +BABEL_OP3_404_77803_20141020_030844_outLine +BABEL_OP3_404_78454_20141115_043455_inLine +BABEL_OP3_404_78749_20150620_025728_inLine +BABEL_OP3_404_78749_20150620_025728_outLine +BABEL_OP3_404_79190_20141108_232204_inLine +BABEL_OP3_404_79190_20141108_232204_outLine +BABEL_OP3_404_79590_20141129_025808_outLine +BABEL_OP3_404_79820_20141104_045340_inLine +BABEL_OP3_404_79820_20141104_045340_outLine +BABEL_OP3_404_79858_20141015_200446_inLine +BABEL_OP3_404_79898_20150620_022648_inLine +BABEL_OP3_404_79898_20150620_022648_outLine +BABEL_OP3_404_79898_20150620_024014_inLine +BABEL_OP3_404_79898_20150620_024014_outLine +BABEL_OP3_404_80069_20150614_233606_inLine +BABEL_OP3_404_80069_20150614_233606_outLine +BABEL_OP3_404_80306_20141119_003833_inLine +BABEL_OP3_404_80306_20141119_003833_outLine +BABEL_OP3_404_80306_20141119_005121_inLine +BABEL_OP3_404_80306_20141119_005121_outLine +BABEL_OP3_404_80559_20141022_010255_inLine +BABEL_OP3_404_80655_20150525_221544_inLine +BABEL_OP3_404_80655_20150525_221544_outLine +BABEL_OP3_404_80897_20141119_233718_inLine +BABEL_OP3_404_80897_20141119_233718_outLine +BABEL_OP3_404_81149_20150525_003741_inLine +BABEL_OP3_404_81149_20150525_003741_outLine +BABEL_OP3_404_81427_20141030_015136_inLine +BABEL_OP3_404_81427_20141030_015136_outLine +BABEL_OP3_404_81854_20150610_060437_inLine +BABEL_OP3_404_82626_20150615_014517_inLine +BABEL_OP3_404_82863_20141119_044230_inLine +BABEL_OP3_404_82863_20141119_044230_outLine +BABEL_OP3_404_83651_20141102_170912_inLine +BABEL_OP3_404_83651_20141102_170912_outLine +BABEL_OP3_404_83771_20150604_012300_outLine +BABEL_OP3_404_83974_20150617_022055_inLine +BABEL_OP3_404_84125_20141018_023340_inLine +BABEL_OP3_404_84125_20141018_023340_outLine +BABEL_OP3_404_84458_20141130_053628_outLine +BABEL_OP3_404_84815_20141127_011952_inLine +BABEL_OP3_404_84815_20141127_013345_inLine +BABEL_OP3_404_85047_20141117_014630_inLine +BABEL_OP3_404_85047_20141117_014630_outLine +BABEL_OP3_404_85340_20141103_022707_inLine +BABEL_OP3_404_85340_20141103_022707_outLine +BABEL_OP3_404_86597_20150612_170328_inLine +BABEL_OP3_404_86597_20150612_170328_outLine +BABEL_OP3_404_87074_20141105_190107_outLine +BABEL_OP3_404_87777_20141127_040747_inLine 
+BABEL_OP3_404_87777_20141127_040747_outLine +BABEL_OP3_404_87871_20141201_023608_inLine +BABEL_OP3_404_87871_20141201_023608_outLine +BABEL_OP3_404_87921_20141201_023029_inLine +BABEL_OP3_404_87921_20141201_023029_outLine +BABEL_OP3_404_88873_20141028_190127_inLine +BABEL_OP3_404_88873_20141028_190127_outLine +BABEL_OP3_404_89330_20150616_002908_inLine +BABEL_OP3_404_89330_20150616_002908_outLine +BABEL_OP3_404_89943_20141105_211847_outLine +BABEL_OP3_404_90347_20141119_012016_inLine +BABEL_OP3_404_90347_20141119_012016_outLine +BABEL_OP3_404_90760_20150611_151739_inLine +BABEL_OP3_404_90760_20150611_151739_outLine +BABEL_OP3_404_90832_20150616_012728_inLine +BABEL_OP3_404_90832_20150616_012728_outLine +BABEL_OP3_404_91383_20150618_035815_inLine +BABEL_OP3_404_91475_20150614_034536_inLine +BABEL_OP3_404_91581_20141129_045608_inLine +BABEL_OP3_404_91581_20141129_045608_outLine +BABEL_OP3_404_91581_20141129_050730_inLine +BABEL_OP3_404_91581_20141129_050730_outLine +BABEL_OP3_404_91593_20150611_021825_inLine +BABEL_OP3_404_91593_20150611_021825_outLine +BABEL_OP3_404_91884_20150503_022858_inLine +BABEL_OP3_404_91884_20150503_022858_outLine +BABEL_OP3_404_91888_20150512_191012_inLine +BABEL_OP3_404_91888_20150512_191012_outLine +BABEL_OP3_404_91891_20141129_005825_inLine +BABEL_OP3_404_91891_20141129_005825_outLine +BABEL_OP3_404_91944_20141022_021002_inLine +BABEL_OP3_404_91977_20141122_230420_outLine +BABEL_OP3_404_92176_20141119_195614_inLine +BABEL_OP3_404_92176_20141119_195614_outLine +BABEL_OP3_404_92281_20150625_185123_inLine +BABEL_OP3_404_92757_20150525_200048_inLine +BABEL_OP3_404_92757_20150525_200048_outLine +BABEL_OP3_404_92792_20150503_182854_outLine +BABEL_OP3_404_92792_20150525_025523_outLine +BABEL_OP3_404_92942_20141120_022830_inLine +BABEL_OP3_404_92942_20141120_022830_outLine +BABEL_OP3_404_93007_20150615_051230_inLine +BABEL_OP3_404_93007_20150615_051230_outLine +BABEL_OP3_404_93858_20150611_043732_inLine +BABEL_OP3_404_94002_20141119_015307_inLine +BABEL_OP3_404_94002_20141119_015307_outLine +BABEL_OP3_404_94333_20141020_024439_outLine +BABEL_OP3_404_94487_20150518_005132_outLine +BABEL_OP3_404_95077_20141201_055702_outLine +BABEL_OP3_404_95269_20141105_221810_inLine +BABEL_OP3_404_95269_20141105_221810_outLine +BABEL_OP3_404_95338_20150610_211203_inLine +BABEL_OP3_404_95338_20150610_211203_outLine +BABEL_OP3_404_95399_20141119_001023_inLine +BABEL_OP3_404_95399_20141119_001023_outLine +BABEL_OP3_404_95583_20141019_010741_inLine +BABEL_OP3_404_95583_20141019_010741_outLine +BABEL_OP3_404_96059_20150524_042224_outLine +BABEL_OP3_404_96205_20141119_033053_inLine +BABEL_OP3_404_96205_20141119_033053_outLine +BABEL_OP3_404_96205_20141119_034909_inLine +BABEL_OP3_404_96205_20141119_034909_outLine +BABEL_OP3_404_96247_20150526_202623_outLine +BABEL_OP3_404_96690_20141117_053054_inLine +BABEL_OP3_404_96690_20141117_053054_outLine +BABEL_OP3_404_96808_20150609_034129_inLine +BABEL_OP3_404_97097_20150601_042649_outLine +BABEL_OP3_404_97136_20150528_011250_inLine +BABEL_OP3_404_97136_20150528_011250_outLine +BABEL_OP3_404_97911_20150613_195820_outLine +BABEL_OP3_404_98165_20141030_214051_inLine +BABEL_OP3_404_98165_20141030_214051_outLine +BABEL_OP3_404_98192_20150617_021906_outLine +BABEL_OP3_404_98489_20141102_002030_inLine +BABEL_OP3_404_98489_20141102_004054_inLine +BABEL_OP3_404_98678_20150528_021605_inLine +BABEL_OP3_404_98678_20150528_023029_inLine +BABEL_OP3_404_99289_20150521_220314_inLine +BABEL_OP3_404_99289_20150521_220314_outLine 
+BABEL_OP3_404_99289_20150521_222144_inLine +BABEL_OP3_404_99289_20150521_222144_outLine +BABEL_OP3_404_99718_20141019_051850_inLine +BABEL_OP3_404_99718_20141019_051850_outLine +BABEL_OP3_404_99718_20141019_053305_inLine +BABEL_OP3_404_99718_20141019_053305_outLine +BABEL_OP3_404_99732_20141130_232553_inLine +BABEL_OP3_404_99732_20141130_232553_outLine +BABEL_OP3_404_99920_20141022_052026_inLine diff --git a/egs/babel/s5d/local/arpa2G.sh b/egs/babel/s5d/local/arpa2G.sh index 40c269fbb22..887b393b459 100755 --- a/egs/babel/s5d/local/arpa2G.sh +++ b/egs/babel/s5d/local/arpa2G.sh @@ -85,7 +85,8 @@ if [ ! -z "$oov_prob_file" ]; then print "$log10prob $word\n"; } }} print STDERR "Ceilinged $ceilinged unk-probs\n";' \ - $oov_prob_file $min_prob $unk_fraction | gzip -c > $destdir/lm_tmp.gz + $oov_prob_file $min_prob $unk_fraction | \ + ngram -unk -lm - -write-lm $destdir/lm_tmp.gz lmfile=$destdir/lm_tmp.gz fi diff --git a/egs/babel/s5d/local/chain/run_blstm.sh b/egs/babel/s5d/local/chain/run_blstm.sh index 6d13c55fc7d..f098604d04a 100755 --- a/egs/babel/s5d/local/chain/run_blstm.sh +++ b/egs/babel/s5d/local/chain/run_blstm.sh @@ -136,7 +136,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_blstm_bab1.sh b/egs/babel/s5d/local/chain/run_blstm_bab1.sh index ba8da0e14bc..95c7e9f28aa 100755 --- a/egs/babel/s5d/local/chain/run_blstm_bab1.sh +++ b/egs/babel/s5d/local/chain/run_blstm_bab1.sh @@ -136,7 +136,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_blstm_bab2.sh b/egs/babel/s5d/local/chain/run_blstm_bab2.sh index f5d698e262c..a6dd4cb9566 100755 --- a/egs/babel/s5d/local/chain/run_blstm_bab2.sh +++ b/egs/babel/s5d/local/chain/run_blstm_bab2.sh @@ -136,7 +136,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_blstm_bab3.sh b/egs/babel/s5d/local/chain/run_blstm_bab3.sh index 7ad51204c6f..52f085f8942 100755 --- a/egs/babel/s5d/local/chain/run_blstm_bab3.sh +++ b/egs/babel/s5d/local/chain/run_blstm_bab3.sh @@ -136,7 +136,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_blstm_bab4.sh b/egs/babel/s5d/local/chain/run_blstm_bab4.sh index 72aaeb8778f..47704e80ae4 100755 --- a/egs/babel/s5d/local/chain/run_blstm_bab4.sh +++ b/egs/babel/s5d/local/chain/run_blstm_bab4.sh @@ -135,7 +135,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_blstm_bab5.sh b/egs/babel/s5d/local/chain/run_blstm_bab5.sh index 1bae225022e..73c6a4089ed 100755 --- a/egs/babel/s5d/local/chain/run_blstm_bab5.sh +++ b/egs/babel/s5d/local/chain/run_blstm_bab5.sh @@ -135,7 +135,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_blstm_xconfig.sh b/egs/babel/s5d/local/chain/run_blstm_xconfig.sh new file mode 100755 index 00000000000..27e1a571ad0 --- /dev/null +++ b/egs/babel/s5d/local/chain/run_blstm_xconfig.sh @@ -0,0 +1,206 @@ +#!/bin/bash + + +# by default, with cleanup: +# local/chain/run_blstm.sh +# %WER 46.8 | 19252 60586 | 57.6 28.5 13.8 4.5 46.8 31.7 | -0.643 | exp/chain_cleaned/blstm_sp_bi/decode_dev10h.pem/score_8/penalty_0.25/dev10h.pem.ctm.sys + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=17 +nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri5_cleaned # the gmm for the target data +langdir=data/langp/tri5_ali +num_threads_ubm=12 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +blstm_affix=_xconfig #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/blstm_sp_bi/egs # you can set this to use previously dumped egs. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + $langdir $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + #echo "$0: creating neural net configs"; + #steps/nnet3/lstm/make_configs.py \ + # --self-repair-scale-nonlinearity 0.00001 \ + # --self-repair-scale-clipgradient 1.0 \ + # $dir/configs || exit 1; + echo "$0: creating neural net configs using the xconfig parser"; + + label_delay=0 + xent_regularize=0.1 + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + lstmp-layer name=blstm1-forward input=lda cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + lstmp-layer name=blstm1-backward input=lda cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=3 + lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=3 + lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=3 + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
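+  # (for example: with xent_regularize=0.1, as set earlier in this stage, the xent output layer below gets a learning-rate factor of 0.5 / 0.1 = 5.0)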
+ output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage + fi + [ ! -d $dir/egs ] && mkdir -p $dir/egs/ + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --left-biphone --self-loop-scale 1.0 data/langp_test $dir $dir/graph +fi + +exit 0 diff --git a/egs/babel/s5d/local/chain/run_ivector_common.sh b/egs/babel/s5d/local/chain/run_ivector_common.sh index 7354d59465b..696fd14b45f 100755 --- a/egs/babel/s5d/local/chain/run_ivector_common.sh +++ b/egs/babel/s5d/local/chain/run_ivector_common.sh @@ -71,7 +71,8 @@ if [ $stage -le 2 ]; then utils/copy_data_dir.sh data/${train_set}_sp data/${train_set}_sp_hires mfccdir=data/${train_set}_sp_hires/data if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$mfccdir/storage $mfccdir/storage fi # do volume-perturbation on the training data prior to extracting hires @@ -171,7 +172,8 @@ if [ $stage -le 7 ]; then # valid for the non-'max2' data, the utterance list is the same. ivectordir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then - utils/create_split_dir.pl /export/b{15,16,17,18}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$ivectordir/storage $ivectordir/storage fi # We extract iVectors on the speed-perturbed training data after combining # short segments, which will be what we train the system on. 
With diff --git a/egs/babel/s5d/local/chain/run_tdnn.sh b/egs/babel/s5d/local/chain/run_tdnn.sh index 3ce53fa9292..2d9b6db75b7 100755 --- a/egs/babel/s5d/local/chain/run_tdnn.sh +++ b/egs/babel/s5d/local/chain/run_tdnn.sh @@ -133,7 +133,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_tdnn_bab1.sh b/egs/babel/s5d/local/chain/run_tdnn_bab1.sh index db82c0f358a..0fa4020977c 100755 --- a/egs/babel/s5d/local/chain/run_tdnn_bab1.sh +++ b/egs/babel/s5d/local/chain/run_tdnn_bab1.sh @@ -133,7 +133,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_tdnn_bab2.sh b/egs/babel/s5d/local/chain/run_tdnn_bab2.sh index 51387901683..ea9d5959c75 100755 --- a/egs/babel/s5d/local/chain/run_tdnn_bab2.sh +++ b/egs/babel/s5d/local/chain/run_tdnn_bab2.sh @@ -133,7 +133,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_tdnn_bab3.sh b/egs/babel/s5d/local/chain/run_tdnn_bab3.sh index 098c3de0482..2973a2c9f02 100755 --- a/egs/babel/s5d/local/chain/run_tdnn_bab3.sh +++ b/egs/babel/s5d/local/chain/run_tdnn_bab3.sh @@ -3,7 +3,6 @@ # by default, with cleanup: # local/chain/run_tdnn.sh - # %WER 46.7 | 19252 60586 | 57.4 26.4 16.2 4.0 46.7 31.6 | -0.469 | exp/chain_cleaned/tdnnbab3_sp_bi/decode_dev10h.pem/score_9/penalty_0.0/dev10h.pem.ctm.sys set -e -o pipefail @@ -134,7 +133,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_tdnn_bab4.sh b/egs/babel/s5d/local/chain/run_tdnn_bab4.sh index 5831cfc28f0..bd2eba9cb8b 100755 --- a/egs/babel/s5d/local/chain/run_tdnn_bab4.sh +++ b/egs/babel/s5d/local/chain/run_tdnn_bab4.sh @@ -133,7 +133,7 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi [ ! -d $dir/egs ] && mkdir -p $dir/egs/ touch $dir/egs/.nodelete # keep egs around when that run dies. diff --git a/egs/babel/s5d/local/chain/run_tdnn_lstm_1e.sh b/egs/babel/s5d/local/chain/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..ec8366492d7 --- /dev/null +++ b/egs/babel/s5d/local/chain/run_tdnn_lstm_1e.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# From egs/swbdrun_tdnn_lstm_1e.sh + +set -e -o pipefail -u + +# configs for 'chain' +stage=0 +nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri5_cleaned # the gmm for the target data +langdir=data/langp/tri5_ali +num_threads_ubm=12 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +blstm_affix=bab1 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1e # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain_cleaned/blstm_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
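+  # (the 7000 below is the number of leaves requested for the new tree)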
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $tree_dir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage + fi + [ ! -d $dir/egs ] && mkdir -p $dir/egs/ + touch $dir/egs/.nodelete # keep egs around when that run dies. 
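+  # The chunk-width and chunk-context options passed to train.py below come from the configuration section near the top of this script (frames_per_chunk=140,100,160, chunk_left_context=40, chunk_right_context=0).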
+ + # (you could also use --trainer.num-chunk-per-minibatch 128,64 here) + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +exit 0 diff --git a/egs/babel/s5d/local/check_tools.sh b/egs/babel/s5d/local/check_tools.sh index ca8800def41..2c96f8445d1 100755 --- a/egs/babel/s5d/local/check_tools.sh +++ b/egs/babel/s5d/local/check_tools.sh @@ -18,20 +18,20 @@ [ -f ./path.sh ] && . ./path.sh sph2pipe=`command -v sph2pipe 2>/dev/null` \ - || { echo >&2 "sph2pipe not found on PATH. Did you run make in the $KALDI_ROOT/tools directory?"; return 1; } + || { echo >&2 "sph2pipe not found on PATH. Did you run make in the $KALDI_ROOT/tools directory?"; exit 1; } srilm=`command -v ngram 2>/dev/null` \ - || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh"; return 1; } + || { echo >&2 "srilm not found on PATH. Please use the script $KALDI_ROOT/tools/extras/install_srilm.sh"; exit 1; } sox=`command -v sox 2>/dev/null` \ - || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; return 1; } + || { echo >&2 "sox not found on PATH. Please install it manually (you will need version 14.4.0 and higher)."; exit 1; } # If sox is found on path, check if the version is correct if [ ! -z "$sox" ]; then sox_version=`$sox --version 2>&1| head -1 | sed -e 's?.*: ??' -e 's?.* ??'` if [[ ! $sox_version =~ v14.4.* ]]; then echo "Unsupported sox version $sox_version found on path. You will need version v14.4.0 and higher." - return 1 + exit 1 fi fi diff --git a/egs/babel/s5d/local/extend_lexicon.sh b/egs/babel/s5d/local/extend_lexicon.sh index c930b1729e0..41b244f110b 100755 --- a/egs/babel/s5d/local/extend_lexicon.sh +++ b/egs/babel/s5d/local/extend_lexicon.sh @@ -148,20 +148,10 @@ cp $input_lexicon $toplevel_dir/input_lexicon.txt # just to have a record of wh loc=`which ngram-count`; if [ -z $loc ]; then - if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
- sdir=`pwd`/../../../tools/srilm/bin/i686-m64 - else - sdir=`pwd`/../../../tools/srilm/bin/i686 - fi - if [ -f $sdir/ngram-count ]; then - echo Using SRILM tools from $sdir - export PATH=$PATH:$sdir - else - echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. - exit 1 - fi + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 fi @@ -231,10 +221,9 @@ if [ $stage -le -3 ]; then echo "$0: using SRILM to train syllable LM" - ngram-count -lm $dir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $dir/syllable_text.txt -sort - + ngram-count -lm $dir/3gram.me.gz -maxent -maxent-convert-to-arpa -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $dir/syllable_text.txt -sort rm $dir/lm.gz 2>/dev/null - ln -s 3gram.kn022.gz $dir/lm.gz + ln -s 3gram.me.gz $dir/lm.gz fi diff --git a/egs/babel/s5d/local/generate_confusion_matrix.sh b/egs/babel/s5d/local/generate_confusion_matrix.sh index 48263e729de..fb602cf0957 100755 --- a/egs/babel/s5d/local/generate_confusion_matrix.sh +++ b/egs/babel/s5d/local/generate_confusion_matrix.sh @@ -61,7 +61,7 @@ fi mkdir -p $wdir/log cat $data/phones.txt | sed 's/_[B|E|I|S]//g' |\ - sed 's/_[%|"]//g' | sed 's/_[0-9]\+//g' > $wdir/phones.txt + sed 's/_[%|"]//g' | sed 's/_[0-9]\+//g' | sed 's/_[^ ]*//g' > $wdir/phones.txt echo "Converting alignments to phone sequences..." $cmd JOB=1:$nj $wdir/log/ali_to_phones.JOB.log \ @@ -81,7 +81,8 @@ for i in `seq 1 $nj` ; do done echo "Converting statistics..." -cat $confusion_files | cut -f 2- -d ' ' | sed 's/ *; */\n/g'| sort | uniq -c | \ +cat $confusion_files | cut -f 2- -d ' ' | sed 's/ *; */\n/g' | \ + sed 's/ *$//g' | sed 's/^ *//g' | sort | uniq -c | \ grep -v -E '|||SIL' | \ perl -ane ' die unless scalar @F == 3; diff --git a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py index b6d4b9ab944..3670ba755bc 100755 --- a/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py +++ b/egs/babel/s5d/local/lexicon/make_unicode_lexicon.py @@ -27,7 +27,7 @@ def main(): unicode_transcription = baseform2unicode(baseforms) encoded_transcription, table = encode(unicode_transcription, args.tag_percentage, - log=args.verbose) + log=args.log) write_table(table, args.lex_out) # Extract dictionary of nonspeech pronunciations @@ -59,7 +59,7 @@ def parse_input(): Parse commandline input. ''' if len(sys.argv[1:]) == 0: - print("Usage: ./make_unicode_lexicon.py [opts] lex_in lex_out") + print("Usage: ./make_unicode_lexicon.py [opts] lex_in lex_out [log]") sys.exit(1) parser = argparse.ArgumentParser() @@ -67,7 +67,9 @@ def parse_input(): "paired with a baseform. 
1 word per line with the " "baseform separated by a tab") parser.add_argument("lex_out", help="Path of output output " - "graphemc lexicon") + "graphemic lexicon") + parser.add_argument("log", nargs='?', default=None, + help="Directory in which the logs will be stored"); parser.add_argument("-F", "--fmt", help="Format of input word list", action="store", default="word_list") parser.add_argument("-T", "--tag_percentage", help="Percentage of least" @@ -246,12 +248,11 @@ def encode(unicode_transcription, tag_percentage, log=False): graph_counts = graph_counts_dict # Print grapheme counts to histogram - if log: + if log is not None: graph_counts_sorted = sorted(graph_counts, reverse=True, key=graph_counts.get) - if not os.path.exists("lex_log"): - os.makedirs("lex_log") - with codecs.open("lex_log/grapheme_histogram.txt", "w", "utf-8") as fp: + logfile = "{}/grapheme_histogram.txt".format(log) + with codecs.open(logfile, "w", "utf-8") as fp: fp.write("Graphemes (Count Threshold = %.6f)\n" % count_thresh) for g in graph_counts_sorted: weight = ("-" * int(np.ceil(500.0 * graph_counts[g])) + diff --git a/egs/babel/s5d/local/nist_eval/create_new_language_configs.LLP.sh b/egs/babel/s5d/local/nist_eval/create_new_language_configs.LLP.sh index 2ffb73810e3..be6aa5c2b40 100755 --- a/egs/babel/s5d/local/nist_eval/create_new_language_configs.LLP.sh +++ b/egs/babel/s5d/local/nist_eval/create_new_language_configs.LLP.sh @@ -4,15 +4,16 @@ # Begin configuration section. language="201-haitian" +corpus=/export/babel/data/ +indus=/export/babel/data/scoring/IndusDB # End configuration section . ./utils/parse_options.sh set -e -o pipefail set -o nounset # Treat unset variables as an error -corpus=/export/babel/data/$language +corpus=$corpus/$language lists=./conf/lists/$language/ -indus=/export/babel/data/scoring/IndusDB corpusdir=$(find $corpus -maxdepth 1 -name "*-build" -type d) || exit 1 [ -z "$corpusdir" ] && "Corpus directory for $language not found!" && exit 1 diff --git a/egs/babel/s5d/local/nnet3/run_blstm.sh b/egs/babel/s5d/local/nnet3/run_blstm.sh index 6833baa0d72..fcf7fb8947d 100755 --- a/egs/babel/s5d/local/nnet3/run_blstm.sh +++ b/egs/babel/s5d/local/nnet3/run_blstm.sh @@ -5,7 +5,7 @@ cell_dim=512 rp_dim=128 nrp_dim=128 affix=bidirectional -multicondition=true +multicondition=false common_egs_dir= num_epochs=8 diff --git a/egs/babel/s5d/local/nnet3/run_ivector_common.sh b/egs/babel/s5d/local/nnet3/run_ivector_common.sh index bfe66d13f76..7313230a7ee 100755 --- a/egs/babel/s5d/local/nnet3/run_ivector_common.sh +++ b/egs/babel/s5d/local/nnet3/run_ivector_common.sh @@ -60,8 +60,8 @@ fi if [ $stage -le 3 ]; then mfccdir=mfcc_hires if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - date=$(date +'%m_%d_%H_%M') - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$mfccdir/storage $mfccdir/storage fi # the 100k_nodup directory is copied seperately, as diff --git a/egs/babel/s5d/local/nnet3/run_ivector_multicondition_common.sh b/egs/babel/s5d/local/nnet3/run_ivector_multicondition_common.sh index 8d3973e65bc..c3a6e1c0952 100755 --- a/egs/babel/s5d/local/nnet3/run_ivector_multicondition_common.sh +++ b/egs/babel/s5d/local/nnet3/run_ivector_multicondition_common.sh @@ -70,8 +70,8 @@ fi if [ $stage -le 3 ]; then mfccdir=mfcc_hires if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then - date=$(date +'%m_%d_%H_%M') - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/kaldi-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$mfccdir/storage $mfccdir/storage fi # the 100k_nodup directory is copied seperately, as @@ -151,8 +151,8 @@ train_set=train_sp_mc if [ $stage -le 7 ]; then mfccdir=mfcc_reverb if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - date=$(date +'%m_%d_%H_%M') - utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/babel_reverb-$date/s5/$mfccdir/storage $mfccdir/storage + utils/create_split_dir.pl \ + /export/b0{1,2,3,4}/$USER/kaldi-data/egs/babel_reverb-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$mfccdir/storage $mfccdir/storage fi for data_dir in $train_set; do utils/copy_data_dir.sh data/$data_dir data/${data_dir}_hires diff --git a/egs/babel/s5d/local/nnet3/run_lstm.sh b/egs/babel/s5d/local/nnet3/run_lstm.sh index 8105cfda387..f7d06501569 100755 --- a/egs/babel/s5d/local/nnet3/run_lstm.sh +++ b/egs/babel/s5d/local/nnet3/run_lstm.sh @@ -121,7 +121,7 @@ fi if [ $stage -le 13 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi steps/nnet3/train_rnn.py --stage=$train_stage \ @@ -136,7 +136,6 @@ if [ $stage -le 13 ]; then --trainer.optimization.final-effective-lrate=$final_effective_lrate \ --trainer.optimization.shrink-value 0.99 \ --trainer.rnn.num-chunk-per-minibatch=$num_chunk_per_minibatch \ - --trainer.optimization.cv-minibatch-size 128 \ --trainer.optimization.momentum=$momentum \ --egs.chunk-width=$chunk_width \ --egs.chunk-left-context=$chunk_left_context \ diff --git a/egs/babel/s5d/local/nnet3/run_lstm_realigned.sh b/egs/babel/s5d/local/nnet3/run_lstm_realigned.sh index acd65e9114e..2448b1b17ff 100755 --- a/egs/babel/s5d/local/nnet3/run_lstm_realigned.sh +++ b/egs/babel/s5d/local/nnet3/run_lstm_realigned.sh @@ -114,7 +114,7 @@ fi if [ $stage -le 3 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi steps/nnet3/train_rnn.py --stage=$train_stage \ diff --git a/egs/babel/s5d/local/nnet3/run_tdnn.sh b/egs/babel/s5d/local/nnet3/run_tdnn.sh index 8899e363dd9..2a663486bcb 100755 --- a/egs/babel/s5d/local/nnet3/run_tdnn.sh +++ b/egs/babel/s5d/local/nnet3/run_tdnn.sh @@ -60,7 +60,7 @@ local/nnet3/run_ivector_common.sh --stage $stage \ if [ $stage -le 9 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage fi steps/nnet3/train_tdnn.sh --stage $train_stage \ diff --git a/egs/babel/s5d/local/reestimate_langp.sh b/egs/babel/s5d/local/reestimate_langp.sh index 059fba52043..ae70b6a8f46 100755 --- a/egs/babel/s5d/local/reestimate_langp.sh +++ b/egs/babel/s5d/local/reestimate_langp.sh @@ -29,5 +29,6 @@ utils/dict_dir_add_pronprobs.sh --max-normalize true $idict \ $amdir/pron_bigram_counts_nowb.txt $odict utils/prepare_lang.sh --phone-symbol-table $langdir/phones.txt \ + --share-silence-phones true \ $odict "$unk" $olocallang $olang diff --git a/egs/babel/s5d/local/run_kws_stt_task2.sh b/egs/babel/s5d/local/run_kws_stt_task2.sh index 6007baa1756..9c10bfe6da5 100755 --- a/egs/babel/s5d/local/run_kws_stt_task2.sh +++ b/egs/babel/s5d/local/run_kws_stt_task2.sh @@ -71,14 +71,26 @@ fi if ! $skip_kws ; then [ ! -f $data_dir/extra_kws_tasks ] && exit 0 - syll_data_dir=$(echo $data_dir | perl -pe 's/\.(pem|seg)$/.syll.$1/g' ) + idata=$(basename $data_dir) + idir=$(dirname $data_dir) + + idataset=${idata%%.*} + idatatype=${idata#*.} + + if [ "$idata" == "$idataset" ]; then + syll_data_dir=$idir/${idataset}.syll + phn_data_dir=$idir/${idataset}.phn + else + syll_data_dir=$idir/${idataset}.syll.${idatatype} + phn_data_dir=$idir/${idataset}.phn.${idatatype} + fi + if [ -d ${syll_data_dir} ] && [ ! -f ${decode_dir}/syllabs/.done ] ; then local/syllab/lattice_word2syll.sh --cmd "$cmd --mem 8G" \ $data_dir $lang_dir ${lang_dir}.syll $decode_dir ${decode_dir}/syllabs touch ${decode_dir}/syllabs/.done fi - phn_data_dir=$(echo $data_dir | perl -pe 's/\.(pem|seg)$/.phn.$1/g' ) if [ -d ${phn_data_dir} ] && [ ! -f ${decode_dir}/phones/.done ] ; then local/syllab/lattice_word2syll.sh --cmd "$cmd --mem 8G" \ $data_dir $lang_dir ${lang_dir}.phn $decode_dir ${decode_dir}/phones diff --git a/egs/babel/s5d/local/search/run_phn_search.sh b/egs/babel/s5d/local/search/run_phn_search.sh index 44587699a38..e4dba529b3d 100755 --- a/egs/babel/s5d/local/search/run_phn_search.sh +++ b/egs/babel/s5d/local/search/run_phn_search.sh @@ -29,7 +29,11 @@ dataset=${dir%%.*} datatype=${dir#*.} lang=data/lang.phn -data=data/${dataset}.phn.${datatype} +if [ "$dir" == "$dataset" ]; then + data=data/${dataset}.phn +else + data=data/${dataset}.phn.${datatype} +fi set +o nounset eval kwsets=${!dataset_kwlists[@]} @@ -76,7 +80,7 @@ if [ $stage -le 2 ] ; then ${data}/kwset_${set}/tmp.4 # and finally, replace the categories by the word-level categories - cp data/$dir/kwset_${set}/categories $data/kwset_${set}/categories + cp data/${dir}/kwset_${set}/categories $data/kwset_${set}/categories done fi diff --git a/egs/babel/s5d/local/search/run_search.sh b/egs/babel/s5d/local/search/run_search.sh index 2cb40cabb59..1fbdb071123 100755 --- a/egs/babel/s5d/local/search/run_search.sh +++ b/egs/babel/s5d/local/search/run_search.sh @@ -67,8 +67,11 @@ if [ $stage -le 2 ] ; then #-- data/dev10h.pem/${set}_oov_kws/tmp/L1.lex data/dev10h.pem/kwset_${set}/tmp.3 if [ -d data/local/extend ]; then echo "Detected extended lexicon system..." 
- local/search/compile_proxy_keywords.sh --cmd "$decode_cmd --mem 12G" --filter "OOV=1&&Characters>2"\ - --beam 5 --nbest 50 --nj 64 --phone-beam 5 --phone-nbest 300 --confusion-matrix exp/conf_matrix/confusions.txt \ + local/search/compile_proxy_keywords.sh --filter "OOV=1&&Characters>2"\ + --cmd "$decode_cmd --mem 24G --max-jobs-run 64" --nj 128 \ + --beam $extlex_proxy_beam --nbest $extlex_proxy_nbest \ + --phone-beam $extlex_proxy_phone_beam --phone-nbest $extlex_proxy_phone_nbest\ + --confusion-matrix exp/conf_matrix/confusions.txt \ data/$dir/kwset_${set} data/lang data/local/lexiconp.txt exp/g2p \ data/$dir/kwset_${set}/tmp.4 else diff --git a/egs/babel/s5d/local/search/run_syll_search.sh b/egs/babel/s5d/local/search/run_syll_search.sh index eb48d836e77..41a925ce13a 100755 --- a/egs/babel/s5d/local/search/run_syll_search.sh +++ b/egs/babel/s5d/local/search/run_syll_search.sh @@ -29,7 +29,11 @@ dataset=${dir%%.*} datatype=${dir#*.} lang=data/lang.syll -data=data/${dataset}.syll.${datatype} +if [ "$dir" == "$dataset" ]; then + data=data/${dataset}.syll +else + data=data/${dataset}.syll.${datatype} +fi set +o nounset eval kwsets=${!dataset_kwlists[@]} diff --git a/egs/babel/s5d/local/search/search.sh b/egs/babel/s5d/local/search/search.sh index 200a49d8e86..6a5b2d35a97 100755 --- a/egs/babel/s5d/local/search/search.sh +++ b/egs/babel/s5d/local/search/search.sh @@ -26,6 +26,7 @@ silence_word= # specify this if you did to in kws_setup.sh, it's more accurate. strict=false duptime=0.6 ntrue_scale=1.0 +frame_subsampling_factor=1 nbest=-1 max_silence_frames=50 # End configuration section. diff --git a/egs/babel/s5d/local/syllab/lattice_word2syll.sh b/egs/babel/s5d/local/syllab/lattice_word2syll.sh index b81bf9d18d4..63e9114875d 100755 --- a/egs/babel/s5d/local/syllab/lattice_word2syll.sh +++ b/egs/babel/s5d/local/syllab/lattice_word2syll.sh @@ -26,7 +26,7 @@ mkdir -p $output/log if [ -f $olang/lex.words2syllabs.fst ] ; then fstinvert $olang/lex.words2syllabs.fst | fstreverse | \ - fstminimize | fstreverse > $output/L.fst + fstminimize --allow_nondet | fstreverse > $output/L.fst $cmd JOB=1:$nj $output/log/convert.JOB.log \ lattice-push --push-strings ark:"gunzip -c $input/lat.JOB.gz|" ark:- \| \ diff --git a/egs/babel/s5d/local/syllab/run_phones.sh b/egs/babel/s5d/local/syllab/run_phones.sh index 6f3c7be4cef..7c4a13c61f9 100755 --- a/egs/babel/s5d/local/syllab/run_phones.sh +++ b/egs/babel/s5d/local/syllab/run_phones.sh @@ -21,10 +21,20 @@ if [ $# -ne 1 ] ; then fi idir=$1 + +if [ ! -d "$idir" ] ; then + echo "The directory $idir does not exist" + exit 1 +fi + idata=${idir##*/} -odata=${idata%%.*}.phn.${idata#*.} +if [ "$idata" == ${idata%%.*} ]; then + odata=${idata%%.*}.phn +else + odata=${idata%%.*}.phn.${idata#*.} +fi if [ $stage -le -1 ] ; then local/syllab/generate_phone_lang.sh \ diff --git a/egs/babel/s5d/local/syllab/run_syllabs.sh b/egs/babel/s5d/local/syllab/run_syllabs.sh index a2ec82f3033..7366ac9ad35 100755 --- a/egs/babel/s5d/local/syllab/run_syllabs.sh +++ b/egs/babel/s5d/local/syllab/run_syllabs.sh @@ -21,10 +21,19 @@ if [ $# -ne 1 ] ; then fi idir=$1 -idata=${idir##*/} +if [ ! 
-d "$idir" ] ; then + echo "The directory $idir does not exist" + exit 1 +fi + +idata=${idir##*/} -odata=${idata%%.*}.syll.${idata#*.} +if [ "$idata" == ${idata%%.*} ]; then + odata=${idata%%.*}.syll +else + odata=${idata%%.*}.syll.${idata#*.} +fi if [ $stage -le -1 ] ; then local/syllab/generate_syllable_lang.sh \ @@ -45,7 +54,7 @@ if [ $stage -le -1 ] ; then local/arpa2G.sh data/srilm.syll/lm.gz data/lang.syll/ data/lang.syll/ fi -if [ $stage -le 0 ] && [ -f "$idir/text" ] ; then +if [ $stage -le 0 ] && [ -f "$idir/text" ]; then #Create dev10h.syll.pem dir steps/align_fmllr.sh \ --boost-silence $boost_sil --nj $train_nj --cmd "$train_cmd" \ diff --git a/egs/babel/s5d/run-1-main-unicode-extend-lex.sh b/egs/babel/s5d/run-1-main-unicode-extend-lex.sh new file mode 100755 index 00000000000..f9de3e8e947 --- /dev/null +++ b/egs/babel/s5d/run-1-main-unicode-extend-lex.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +# Parameters for extended lexicon. +extend_lexicon=true +unk_fraction_boost=1.0 +num_sent_gen=12000000 +num_prons=1000000 +morfessor=true +tag_percentage=0.1 +denlats_only=false + +[ ! -f ./lang.conf ] && echo 'Language configuration does not exist! Use the configurations in conf/lang/* as a startup' && exit 1 +[ ! -f ./conf/common_vars.sh ] && echo 'the file conf/common_vars.sh does not exist!' && exit 1 + +. conf/common_vars.sh || exit 1; +. ./lang.conf || exit 1; + +[ -f local.conf ] && . ./local.conf + +. ./utils/parse_options.sh + +set -e #Exit on non-zero return code from any command +set -o pipefail #Exit if any of the commands in the pipeline will + #return non-zero return code +#set -u #Fail on an undefined variable + +lexicon=data/local/lexicon.txt +if $extend_lexicon; then + lexicon=data/local/lexiconp.txt +fi + +./local/check_tools.sh || exit 1 + +#Preparing dev2h and train directories +if [ ! -f data/raw_train_data/.done ]; then + echo --------------------------------------------------------------------- + echo "Subsetting the TRAIN set" + echo --------------------------------------------------------------------- + + local/make_corpus_subset.sh "$train_data_dir" "$train_data_list" ./data/raw_train_data + train_data_dir=`readlink -f ./data/raw_train_data` + touch data/raw_train_data/.done +fi +nj_max=`cat $train_data_list | wc -l` +if [[ "$nj_max" -lt "$train_nj" ]] ; then + echo "The maximum reasonable number of jobs is $nj_max (you have $train_nj)! (The training and decoding process has file-granularity)" + exit 1; + train_nj=$nj_max +fi +train_data_dir=`readlink -f ./data/raw_train_data` + +if [ ! -d data/raw_dev2h_data ]; then + echo --------------------------------------------------------------------- + echo "Subsetting the DEV2H set" + echo --------------------------------------------------------------------- + local/make_corpus_subset.sh "$dev2h_data_dir" "$dev2h_data_list" ./data/raw_dev2h_data || exit 1 +fi + +if [ ! -d data/raw_dev10h_data ]; then + echo --------------------------------------------------------------------- + echo "Subsetting the DEV10H set" + echo --------------------------------------------------------------------- + local/make_corpus_subset.sh "$dev10h_data_dir" "$dev10h_data_list" ./data/raw_dev10h_data || exit 1 +fi + +# Move data/dev2h preparation forward so we can get data/dev2h/text for +# diagnostic purpose when extending the lexicon. +if [[ ! 
-f data/dev2h/wav.scp || data/dev2h/wav.scp -ot ./data/raw_dev2h_data/audio ]]; then + echo --------------------------------------------------------------------- + echo "Preparing dev2h data lists in data/dev2h on" `date` + echo --------------------------------------------------------------------- + mkdir -p data/dev2h + local/prepare_acoustic_training_data.pl \ + --fragmentMarkers \-\*\~ \ + `pwd`/data/raw_dev2h_data data/dev2h > data/dev2h/skipped_utts.log || exit 1 +fi + +if [[ ! -f data/dev2h/glm || data/dev2h/glm -ot "$glmFile" ]]; then + echo --------------------------------------------------------------------- + echo "Preparing dev2h stm files in data/dev2h on" `date` + echo --------------------------------------------------------------------- + if [ -z $dev2h_stm_file ]; then + echo "WARNING: You should define the variable stm_file pointing to the IndusDB stm" + echo "WARNING: Doing that, it will give you scoring close to the NIST scoring. " + local/prepare_stm.pl --fragmentMarkers \-\*\~ data/dev2h || exit 1 + else + local/augment_original_stm.pl $dev2h_stm_file data/dev2h || exit 1 + fi + [ ! -z $glmFile ] && cp $glmFile data/dev2h/glm + +fi + +mkdir -p data/local +if [[ ! -f $lexicon || $lexicon -ot "$lexicon_file" ]]; then + echo --------------------------------------------------------------------- + echo "Preparing lexicon in data/local on" `date` + echo --------------------------------------------------------------------- + + local/lexicon/make_word_list.py $train_data_dir/filelist.list $train_data_dir/transcription data/local/word_list.txt + echo -e " SIL\n \n \n " > data/local/nonspeech.txt + echo -e " " > data/local/extraspeech.txt + + fmt="word_list" + if $morfessor; then + fmt="morfessor" + morfessor-train --encoding=utf_8 --traindata-list -f"-_" -s data/local/morfessor.bin \ + data/local/word_list.txt + morfessor-segment --encoding=utf_8 --output-format-separator '.' --viterbi-maxlen 3 \ + -l data/local/morfessor.bin <(cut -d' ' -f2 data/local/word_list.txt) \ + | sed 's/\.[\_\-]\././g' > data/local/segments + cut -d' ' data/local/word_list.txt -f2 | paste -d' ' - data/local/segments > data/local/word_list_tmp.txt + mv data/local/word_list_tmp.txt data/local/word_list.txt + fi + + local/lexicon/make_unicode_lexicon.py --tag_percentage $tag_percentage --fmt $fmt \ + --nonspeech data/local/nonspeech.txt --extraspeech data/local/extraspeech.txt \ + --verbose data/local/word_list.txt data/local/lexicon.txt data/local/ + local/prepare_unicode_lexicon.py --nonspeech data/local/nonspeech.txt \ + --extraspeech data/local/extraspeech.txt data/local/lexicon_table.txt data/local + cp data/local/lexicon.txt data/local/filtered_lexicon.txt + if $extend_lexicon; then + # Extend the original lexicon. + # Will creates the files data/local/extend/{lexiconp.txt,oov2prob}. + mv data/local/lexicon.txt data/local/lexicon_orig.txt + local/extend_lexicon.sh --cmd "$train_cmd" --cleanup false \ + --num-sent-gen $num_sent_gen --num-prons $num_prons \ + data/local/lexicon_orig.txt data/local/extend data/dev2h/text + cp data/local/extend/lexiconp.txt data/local/ + fi +fi + +mkdir -p data/lang +if [[ ! -f data/lang/L.fst || data/lang/L.fst -ot $lexicon ]]; then + echo --------------------------------------------------------------------- + echo "Creating L.fst etc in data/lang on" `date` + echo --------------------------------------------------------------------- + utils/prepare_lang.sh \ + --share-silence-phones true \ + data/local $oovSymbol data/local/tmp.lang data/lang +fi + +if [[ ! 
-f data/train/wav.scp || data/train/wav.scp -ot "$train_data_dir" ]]; then + echo --------------------------------------------------------------------- + echo "Preparing acoustic training lists in data/train on" `date` + echo --------------------------------------------------------------------- + mkdir -p data/train + local/prepare_acoustic_training_data.pl \ + --vocab $lexicon --fragmentMarkers \-\*\~ \ + $train_data_dir data/train > data/train/skipped_utts.log +fi + +if [[ ! -f data/srilm/lm.gz || data/srilm/lm.gz -ot data/train/text ]]; then + echo --------------------------------------------------------------------- + echo "Training SRILM language models on" `date` + echo --------------------------------------------------------------------- + # If extending the lexicon, use "--words-file data/local/lexicon_orig.txt" so + # that the LM is trained just on the vocab that appears in the text. Will add + # in the OOVs later. + words_file_param=() + if $extend_lexicon; then + words_file_param=(--words-file data/local/lexicon_orig.txt) + fi + local/train_lms_srilm.sh --oov-symbol "$oovSymbol"\ + "${words_file_param[@]}" \ + --train-text data/train/text data data/srilm +fi + +if [[ ! -f data/lang/G.fst || data/lang/G.fst -ot data/srilm/lm.gz ||\ + ( -f data/local/extend/oov2prob &&\ + data/lang/G.fst -ot data/local/extend/oov2prob ) ]]; then + echo --------------------------------------------------------------------- + echo "Creating G.fst on " `date` + echo --------------------------------------------------------------------- + extend_lexicon_param=() + if $extend_lexicon; then + [ -f data/local/extend/original_oov_rates ] || exit 1; + unk_fraction=`cat data/local/extend/original_oov_rates |\ + grep "token" | awk -v x=$unk_fraction_boost '{print $NF/100.0*x}'` + extend_lexicon_param=(--cleanup false --unk-fraction $unk_fraction \ + --oov-prob-file data/local/extend/oov2prob) + fi + local/arpa2G.sh ${extend_lexicon_param[@]} \ + data/srilm/lm.gz data/lang data/lang +fi + +echo --------------------------------------------------------------------- +echo "Starting plp feature extraction for data/train in plp on" `date` +echo --------------------------------------------------------------------- + +if [ ! -f data/train/.plp.done ]; then + if $use_pitch; then + steps/make_plp_pitch.sh --cmd "$train_cmd" --nj $train_nj data/train exp/make_plp_pitch/train plp + else + steps/make_plp.sh --cmd "$train_cmd" --nj $train_nj data/train exp/make_plp/train plp + fi + utils/fix_data_dir.sh data/train + steps/compute_cmvn_stats.sh data/train exp/make_plp/train plp + utils/fix_data_dir.sh data/train + touch data/train/.plp.done +fi + +touch data/.extlex +mkdir -p exp + +echo ------------------------------------------------------------------------- +echo "Extended lexicon finished on" `date`. Now running script run-1-main.sh +echo ------------------------------------------------------------------------- +./run-1-main-unicode.sh --denlats-only "$denlats_only" +exit 0 diff --git a/egs/babel/s5d/run-1-main-unicode.sh b/egs/babel/s5d/run-1-main-unicode.sh index e3fb2486c83..acd2693cbef 100755 --- a/egs/babel/s5d/run-1-main-unicode.sh +++ b/egs/babel/s5d/run-1-main-unicode.sh @@ -80,7 +80,7 @@ if [[ ! 
-f $lexicon || $lexicon -ot "$lexicon_file" ]]; then local/lexicon/make_unicode_lexicon.py --tag_percentage $tag_percentage --fmt $fmt \ --nonspeech data/local/nonspeech.txt --extraspeech data/local/extraspeech.txt \ - --verbose data/local/word_list.txt data/local/lexicon.txt + --verbose data/local/word_list.txt data/local/lexicon.txt data/local/ local/prepare_unicode_lexicon.py --nonspeech data/local/nonspeech.txt \ --extraspeech data/local/extraspeech.txt data/local/lexicon_table.txt data/local cp data/local/lexicon.txt data/local/filtered_lexicon.txt diff --git a/egs/babel/s5d/run-4-anydecode.sh b/egs/babel/s5d/run-4-anydecode.sh index 083ac7e9879..8ac0fde2621 100755 --- a/egs/babel/s5d/run-4-anydecode.sh +++ b/egs/babel/s5d/run-4-anydecode.sh @@ -26,7 +26,7 @@ extra_left_context=40 extra_right_context=40 frames_per_chunk=20 -echo "run-4-test.sh $@" +echo "$0 $@" . utils/parse_options.sh @@ -61,7 +61,9 @@ dataset_type=${dir%%.*} #By default, we want the script to accept how the dataset should be handled, #i.e. of what kind is the dataset if [ -z ${kind} ] ; then - if [ "$dataset_type" == "dev2h" ] || [ "$dataset_type" == "dev10h" ]; then + if [ "$dataset_type" == "dev2h" ] || \ + [ "$dataset_type" == "dev10h" ] || \ + [ "$dataset_type" == "train" ]; then dataset_kind=supervised else dataset_kind=unsupervised @@ -96,11 +98,24 @@ if [ -z $my_data_dir ] || [ -z $my_data_list ] ; then exit 1 fi +if [ "$dataset_type" == "train" ] ; then + local/ali_to_rttm.sh --cmd "$decode_cmd" data/train data/langp_test exp/tri5_ali + bash -x local/qbe/wav_to_ecf.sh data/train/wav.scp > data/train/ecf.train.xml + train_rttm_file=./exp/tri5_ali/rttm + train_ecf_file=./data/train/ecf.train.xml +fi + + eval my_stm_file=\$${dataset_type}_stm_file eval my_ecf_file=\$${dataset_type}_ecf_file eval my_rttm_file=\$${dataset_type}_rttm_file eval my_nj=\$${dataset_type}_nj #for shadow, this will be re-set when appropriate +echo "my_stm_file=$my_stm_file" +echo "my_ecf_file=$my_ecf_file" +echo "my_rttm_file=$my_rttm_file" +echo "my_nj=$my_nj" + if [ -z "$my_nj" ]; then echo >&2 "You didn't specify the number of jobs -- variable \"${dataset_type}_nj\" not defined." exit 1 @@ -214,7 +229,8 @@ if [ ! -f $dataset_dir/.done ] ; then . ./local/datasets/supervised_seg.sh || exit 1 elif [ "$dataset_segments" == "uem" ]; then . ./local/datasets/supervised_uem.sh || exit 1 - elif [ "$dataset_segments" == "pem" ]; then + elif [ "$dataset_segments" == "train" ] ||\ + [ "$dataset_segments" == "pem" ]; then . ./local/datasets/supervised_pem.sh || exit 1 else echo "Unknown type of the dataset: \"$dataset_segments\"!"; @@ -294,29 +310,31 @@ echo --------------------------------------------------------------------- echo "Preparing kws data files in ${dataset_dir} on" `date` echo --------------------------------------------------------------------- lang=data/lang -if ! $skip_kws ; then - if $extra_kws ; then - L1_lex=data/local/lexiconp.txt - . ./local/datasets/extra_kws.sh || exit 1 - fi - if $vocab_kws ; then - . ./local/datasets/vocab_kws.sh || exit 1 - fi - if [ ! -f data/lang.phn/G.fst ] ; then - ./local/syllab/run_phones.sh --stage -2 ${dataset_dir} - else - ./local/syllab/run_phones.sh ${dataset_dir} - fi +if [ ! -f data/dev10h.pem/.done.kws.dev ] ; then + if ! $skip_kws ; then + if $extra_kws ; then + L1_lex=data/local/lexiconp.txt + . ./local/datasets/extra_kws.sh || exit 1 + fi + if $vocab_kws ; then + . ./local/datasets/vocab_kws.sh || exit 1 + fi + if [ ! 
-f data/lang.phn/G.fst ] ; then + ./local/syllab/run_phones.sh --stage -2 ${dataset_dir} + else + ./local/syllab/run_phones.sh ${dataset_dir} + fi - if [ ! -f data/lang.syll/G.fst ] ; then - ./local/syllab/run_syllabs.sh --stage -2 ${dataset_dir} - else - ./local/syllab/run_syllabs.sh ${dataset_dir} - fi + if [ ! -f data/lang.syll/G.fst ] ; then + ./local/syllab/run_syllabs.sh --stage -2 ${dataset_dir} + else + ./local/syllab/run_syllabs.sh ${dataset_dir} + fi - ./local/search/run_search.sh --dir ${dataset_dir##*/} - ./local/search/run_phn_search.sh --dir ${dataset_dir##*/} - ./local/search/run_syll_search.sh --dir ${dataset_dir##*/} + ./local/search/run_search.sh --dir ${dataset_dir##*/} + ./local/search/run_phn_search.sh --dir ${dataset_dir##*/} + ./local/search/run_syll_search.sh --dir ${dataset_dir##*/} + fi fi if $data_only ; then @@ -379,72 +397,6 @@ if $tri5_only; then exit 0 fi -#################################################################### -## SGMM2 decoding -## We Include the SGMM_MMI inside this, as we might only have the DNN systems -## trained and not PLP system. The DNN systems build only on the top of tri5 stage -#################################################################### -if [ -f exp/sgmm5/.done ]; then - decode=exp/sgmm5/decode_fmllr_${dataset_id} - if [ ! -f $decode/.done ]; then - echo --------------------------------------------------------------------- - echo "Spawning $decode on" `date` - echo --------------------------------------------------------------------- - utils/mkgraph.sh \ - data/langp_test exp/sgmm5 exp/sgmm5/graph |tee exp/sgmm5/mkgraph.log - - mkdir -p $decode - steps/decode_sgmm2.sh --skip-scoring true --use-fmllr true --nj $my_nj \ - --cmd "$decode_cmd" --transform-dir exp/tri5/decode_${dataset_id} "${decode_extra_opts[@]}"\ - exp/sgmm5/graph ${dataset_dir} $decode |tee $decode/decode.log - touch $decode/.done - - if ! $fast_path ; then - local/run_kws_stt_task2.sh --cer $cer --max-states $max_states \ - --skip-scoring $skip_scoring --extra-kws $extra_kws --wip $wip \ - --cmd "$decode_cmd" --skip-kws $skip_kws --skip-stt $skip_stt \ - "${lmwt_plp_extra_opts[@]}" \ - ${dataset_dir} data/langp_test exp/sgmm5/decode_fmllr_${dataset_id} - fi - fi - - #################################################################### - ## - ## SGMM_MMI rescoring - ## - #################################################################### - - for iter in 1 2 3 4; do - # Decode SGMM+MMI (via rescoring). - decode=exp/sgmm5_mmi_b0.1/decode_fmllr_${dataset_id}_it$iter - if [ -x exp/sgmm5_mmi_b0.1 ] && [ ! -f $decode/.done ]; then - - mkdir -p $decode - steps/decode_sgmm2_rescore.sh --skip-scoring true \ - --cmd "$decode_cmd" --iter $iter --transform-dir exp/tri5/decode_${dataset_id} \ - data/langp_test ${dataset_dir} exp/sgmm5/decode_fmllr_${dataset_id} $decode | tee ${decode}/decode.log - - touch $decode/.done - fi - done - - #We are done -- all lattices has been generated. We have to - #a)Run MBR decoding - #b)Run KW search - for iter in 1 2 3 4; do - # Decode SGMM+MMI (via rescoring). 
- decode=exp/sgmm5_mmi_b0.1/decode_fmllr_${dataset_id}_it$iter - if [ -f $decode/.done ]; then - local/run_kws_stt_task2.sh --cer $cer --max-states $max_states \ - --skip-scoring $skip_scoring --extra-kws $extra_kws --wip $wip \ - --cmd "$decode_cmd" --skip-kws $skip_kws --skip-stt $skip_stt \ - "${lmwt_plp_extra_opts[@]}" \ - ${dataset_dir} data/langp_test $decode - fi - done -fi - - #################################################################### ## @@ -476,10 +428,13 @@ fi ## nnet3 model decoding ## #################################################################### -if [ -f exp/nnet3/lstm_bidirectional_sp/.done ]; then +if [ -f exp/nnet3/lstm_bidirectional_sp/final.mdl ]; then decode=exp/nnet3/lstm_bidirectional_sp/decode_${dataset_id} rnn_opts=" --extra-left-context 40 --extra-right-context 40 --frames-per-chunk 20 " decode_script=steps/nnet3/decode.sh + my_nj_backup=$my_nj + echo "Modifying the number of jobs as this is an RNN and decoding can be extremely slow." + my_nj=`cat ${dataset_dir}_hires/spk2utt|wc -l` if [ ! -f $decode/.done ]; then mkdir -p $decode $decode_script --nj $my_nj --cmd "$decode_cmd" $rnn_opts \ @@ -496,9 +451,11 @@ if [ -f exp/nnet3/lstm_bidirectional_sp/.done ]; then --cmd "$decode_cmd" --skip-kws $skip_kws --skip-stt $skip_stt \ "${lmwt_dnn_extra_opts[@]}" \ ${dataset_dir} data/langp_test $decode + + my_nj=$my_nj_backup fi -if [ -f exp/nnet3/lstm_realigned_bidirectional_sp//.done ]; then +if [ -f exp/nnet3/lstm_realigned_bidirectional_sp/final.mdl ]; then decode=exp/nnet3/lstm_realigned_bidirectional_sp//decode_${dataset_id} rnn_opts=" --extra-left-context 40 --extra-right-context 40 --frames-per-chunk 20 " decode_script=steps/nnet3/decode.sh @@ -519,7 +476,7 @@ if [ -f exp/nnet3/lstm_realigned_bidirectional_sp//.done ]; then "${lmwt_dnn_extra_opts[@]}" \ ${dataset_dir} data/langp_test $decode fi -if [ -f exp/nnet3/lstm_sp/.done ]; then +if [ -f exp/nnet3/lstm_sp/final.mdl ]; then decode=exp/nnet3/lstm_sp/decode_${dataset_id} rnn_opts=" --extra-left-context 40 --extra-right-context 0 --frames-per-chunk 20 " decode_script=steps/nnet3/decode.sh @@ -541,7 +498,7 @@ if [ -f exp/nnet3/lstm_sp/.done ]; then ${dataset_dir} data/langp_test $decode fi -if [ -f exp/$nnet3_model/.done ]; then +if [ -f exp/$nnet3_model/final.mdl ]; then decode=exp/$nnet3_model/decode_${dataset_id} rnn_opts= decode_script=steps/nnet3/decode.sh @@ -583,6 +540,7 @@ if [ -f exp/$chain_model/final.mdl ]; then touch exp/nnet3$parent_dir_suffix/ivectors_${dataset_id}/.done fi + my_nj_backup=$my_nj rnn_opts= if [ "$is_rnn" == "true" ]; then rnn_opts=" --extra-left-context $extra_left_context --extra-right-context $extra_right_context --frames-per-chunk $frames_per_chunk " @@ -608,6 +566,7 @@ if [ -f exp/$chain_model/final.mdl ]; then --cmd "$decode_cmd" --skip-kws $skip_kws --skip-stt $skip_stt \ "${lmwt_chain_extra_opts[@]}" \ ${dataset_dir} data/langp_test $decode + my_nj=$my_nj_backup else echo "no chain model exp/$chain_model" fi @@ -720,5 +679,72 @@ for dnn in tri6_nnet_semi_supervised tri6_nnet_semi_supervised2 \ ${dataset_dir} data/langp_test $decode fi done + +#################################################################### +## SGMM2 decoding +## We Include the SGMM_MMI inside this, as we might only have the DNN systems +## trained and not PLP system. The DNN systems build only on the top of tri5 stage +#################################################################### +if [ -f exp/sgmm5/.done ]; then + decode=exp/sgmm5/decode_fmllr_${dataset_id} + if [ ! 
-f $decode/.done ]; then + echo --------------------------------------------------------------------- + echo "Spawning $decode on" `date` + echo --------------------------------------------------------------------- + utils/mkgraph.sh \ + data/langp_test exp/sgmm5 exp/sgmm5/graph |tee exp/sgmm5/mkgraph.log + + mkdir -p $decode + steps/decode_sgmm2.sh --skip-scoring true --use-fmllr true --nj $my_nj \ + --cmd "$decode_cmd" --transform-dir exp/tri5/decode_${dataset_id} "${decode_extra_opts[@]}"\ + exp/sgmm5/graph ${dataset_dir} $decode |tee $decode/decode.log + touch $decode/.done + + if ! $fast_path ; then + local/run_kws_stt_task2.sh --cer $cer --max-states $max_states \ + --skip-scoring $skip_scoring --extra-kws $extra_kws --wip $wip \ + --cmd "$decode_cmd" --skip-kws $skip_kws --skip-stt $skip_stt \ + "${lmwt_plp_extra_opts[@]}" \ + ${dataset_dir} data/langp_test exp/sgmm5/decode_fmllr_${dataset_id} + fi + fi + + #################################################################### + ## + ## SGMM_MMI rescoring + ## + #################################################################### + + for iter in 1 2 3 4; do + # Decode SGMM+MMI (via rescoring). + decode=exp/sgmm5_mmi_b0.1/decode_fmllr_${dataset_id}_it$iter + if [ -x exp/sgmm5_mmi_b0.1 ] && [ ! -f $decode/.done ]; then + + mkdir -p $decode + steps/decode_sgmm2_rescore.sh --skip-scoring true \ + --cmd "$decode_cmd" --iter $iter --transform-dir exp/tri5/decode_${dataset_id} \ + data/langp_test ${dataset_dir} exp/sgmm5/decode_fmllr_${dataset_id} $decode | tee ${decode}/decode.log + + touch $decode/.done + fi + done + + #We are done -- all lattices has been generated. We have to + #a)Run MBR decoding + #b)Run KW search + for iter in 1 2 3 4; do + # Decode SGMM+MMI (via rescoring). + decode=exp/sgmm5_mmi_b0.1/decode_fmllr_${dataset_id}_it$iter + if [ -f $decode/.done ]; then + local/run_kws_stt_task2.sh --cer $cer --max-states $max_states \ + --skip-scoring $skip_scoring --extra-kws $extra_kws --wip $wip \ + --cmd "$decode_cmd" --skip-kws $skip_kws --skip-stt $skip_stt \ + "${lmwt_plp_extra_opts[@]}" \ + ${dataset_dir} data/langp_test $decode + fi + done +fi + + echo "Everything looking good...." exit 0 diff --git a/egs/callhome_egyptian/s5/run.sh b/egs/callhome_egyptian/s5/run.sh index 9d1fa692da0..4d1359bea98 100755 --- a/egs/callhome_egyptian/s5/run.sh +++ b/egs/callhome_egyptian/s5/run.sh @@ -29,7 +29,7 @@ local/callhome_prepare_dict.sh $eca_lexicon utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation local/callhome_train_lms.sh $split local/callhome_create_test_lang.sh @@ -100,7 +100,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. 
on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ @@ -108,7 +108,7 @@ steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ steps/train_sat.sh --cmd "$train_cmd" \ 2200 25000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; - + ( utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ @@ -140,9 +140,9 @@ steps/train_sat.sh --cmd "$train_cmd" \ )& dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + --parallel-opts "--num-threads 16" --cmd "queue.pl --mem 1G") dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 -l mem_free=2G,ram_free=1G") + --parallel-opts "--gpu 1" --cmd "queue.pl --mem 1G") steps/nnet2/train_pnorm_ensemble.sh \ --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ @@ -153,17 +153,17 @@ steps/nnet2/train_pnorm_ensemble.sh \ data/train data/lang exp/tri5a_ali exp/tri6a_dnn ( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev ) & # Decode test sets ( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_test exp/tri5a/graph data/test exp/tri6a_dnn/decode_test - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_sup exp/tri5a/graph data/sup exp/tri6a_dnn/decode_sup - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_h5 exp/tri5a/graph data/h5 exp/tri6a_dnn/decode_h5 ) & diff --git a/egs/chime3/s5/cmd.sh b/egs/chime3/s5/cmd.sh index 7ee5fbcd73d..cf2570db1a9 100755 --- a/egs/chime3/s5/cmd.sh +++ b/egs/chime3/s5/cmd.sh @@ -6,9 +6,9 @@ # the number of cpus on your machine. #a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" +#export train_cmd="queue.pl" +#export decode_cmd="queue.pl --mem 4G" +#export mkgraph_cmd="queue.pl --mem 4G" #export cuda_cmd="..." 
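# The cmd.sh and recipe changes throughout this patch replace raw SGE resource
# strings with queue.pl's portable options: "-pe smp N" becomes "--num-threads N",
# "-l gpu=1" becomes "--gpu 1", and "-l mem_free=XG,ram_free=XG" becomes "--mem XG".
# A minimal sketch of a new-style cmd.sh, with illustrative (not recipe-specific)
# memory and thread values:
#
#export train_cmd="queue.pl --mem 2G"
#export decode_cmd="queue.pl --mem 4G --num-threads 4"
#export cuda_cmd="queue.pl --gpu 1"
#
# queue.pl maps these generic options onto the local grid engine's own flags, so
# the same cmd.sh settings can be reused across different clusters.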
diff --git a/egs/csj/s5/local/csj_run_rnnlm.sh b/egs/csj/s5/local/csj_run_rnnlm.sh index 5c6cd4343f6..e02f19bb680 100755 --- a/egs/csj/s5/local/csj_run_rnnlm.sh +++ b/egs/csj/s5/local/csj_run_rnnlm.sh @@ -3,7 +3,7 @@ # Copyright 2016 Tokyo Institute of Technology (Authors: Tomohiro Tanaka, Takafumi Moriya and Takahiro Shinozaki) # 2016 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) # Apache 2.0 -# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. +# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. [ -f ./path.sh ] && . ./path.sh . utils/parse_options.sh @@ -21,7 +21,7 @@ echo h30 Begin local/csj_train_rnnlms.sh --dict-suffix "_nosp" data/local/rnnlm.h30 sleep 20; # wait till tools compiled. -echo h100 Begin +echo h100 Begin local/csj_train_rnnlms.sh --dict-suffix "_nosp" \ --hidden 100 --nwords 10000 --class 200 \ --direct 0 data/local/rnnlm.h100 @@ -60,9 +60,9 @@ for dict in rnnlm.h30 rnnlm.h100 rnnlm.h200 rnnlm.h300 rnnlm.h400 rnnlm.h500 ;do echo "rnnlm0.5" steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver \ - --N 100 --cmd "queue -l mem_free=1G" --inv-acwt $acwt 0.5 \ + --N 100 --cmd "$decode_cmd --mem 1G" --inv-acwt $acwt 0.5 \ data/lang_csj_tg $dir data/$eval_num $sourcedir ${resultsdir}_L0.5 - + rm -rf ${resultsdir}_L0.25 rm -rf ${resultsdir}_L0.75 cp -rp ${resultsdir}_L0.5 ${resultsdir}_L0.25 @@ -70,12 +70,12 @@ for dict in rnnlm.h30 rnnlm.h100 rnnlm.h200 rnnlm.h300 rnnlm.h400 rnnlm.h500 ;do echo "rnnlm0.25" steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver \ - --stage 7 --N 100 --cmd "$decode_cmd -l mem_free=1G" --inv-acwt $acwt 0.25 \ + --stage 7 --N 100 --cmd "$decode_cmd --mem 1G" --inv-acwt $acwt 0.25 \ data/lang_csj_tg $dir data/$eval_num $sourcedir ${resultsdir}_L0.25 echo "rnnlm0.75" steps/rnnlmrescore.sh --rnnlm_ver $rnnlm_ver \ - --stage 7 --N 100 --cmd "$decode_cmd -l mem_free=1G" --inv-acwt $acwt 0.75 \ + --stage 7 --N 100 --cmd "$decode_cmd --mem 1G" --inv-acwt $acwt 0.75 \ data/lang_csj_tg $dir data/$eval_num $sourcedir ${resultsdir}_L0.75 done done diff --git a/egs/csj/s5/local/nnet/run_lstm.sh b/egs/csj/s5/local/nnet/run_lstm.sh index 3cc330c55a8..dc0f40dec24 100755 --- a/egs/csj/s5/local/nnet/run_lstm.sh +++ b/egs/csj/s5/local/nnet/run_lstm.sh @@ -34,10 +34,10 @@ stage=0 steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 10 $dir $dir/log $dir/data || exit 1; steps/compute_cmvn_stats.sh $dir $dir/log $dir/data || exit 1; done - + # Training set utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp - steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \ + steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd --max-jobs-run 10" \ $train $train/log $train/data || exit 1; steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; # Split the training set diff --git a/egs/fame/README.txt b/egs/fame/README.txt new file mode 100644 index 00000000000..d2ed39eef75 --- /dev/null +++ b/egs/fame/README.txt @@ -0,0 +1,15 @@ +The FAME! Speech Corpus + +The components of the Frisian data collection are speech and language resources gathered for building a large vocabulary ASR system for the Frisian language. Firstly, a new broadcast database is created by collecting recordings from the archives of the regional broadcaster Omrop Fryslân, and annotating them with various information such as the language switches and speaker details. The second component of this collection is a language model created on a text corpus with diverse vocabulary. 
Thirdly, a Frisian phonetic dictionary with the mappings between the Frisian words and phones is built to make ASR viable for this under-resourced language. Finally, an ASR recipe is provided which uses all of the previous resources to perform recognition and report the recognition performance. + +The corpus consists of short utterances extracted from 203 audio segments, each approximately 5 minutes long, which are parts of various radio programs covering a time span of almost 50 years (1966-2015), adding a longitudinal dimension to the database. The content of the recordings is very diverse, including radio programs about culture, history, literature, sports, nature, agriculture, politics, society and languages. The total duration of the manually annotated radio broadcasts adds up to 18 hours, 33 minutes and 57 seconds. The stereo audio data has a sampling frequency of 48 kHz and 16-bit resolution per sample. The available meta-information helped the annotators to identify the speakers and mark them either with their names or with a consistent label (if the name is not known). There are 309 identified speakers in the FAME! Speech Corpus, 21 of whom appear at least 3 times in the database. These speakers are mostly program presenters and celebrities appearing multiple times in different recordings over the years. There are 233 unidentified speakers due to a lack of meta-information. The total number of word- and sentence-level code-switching cases in the FAME! Speech Corpus is 3837. Music portions have been removed, except where they overlap with speech. + +A full description of the FAME! Speech Corpus is provided in: + +Yilmaz, E., Heuvel, H. van den, Van de Velde, H., Kampstra, F., Algra, J., Leeuwen, D. van: + +Open Source Speech and Language Resources for Frisian Language. + +In: Proceedings Interspeech 2016, pp. 1536--1540, 8-12 September 2016, San Francisco + +Please check http://www.ru.nl/clst/datasets/ to get the FAME!
Speech Corpus diff --git a/egs/fame/s5/RESULTS b/egs/fame/s5/RESULTS new file mode 100644 index 00000000000..a8541fba6b5 --- /dev/null +++ b/egs/fame/s5/RESULTS @@ -0,0 +1,28 @@ +%WER 41.10 [ 4974 / 12101, 522 ins, 1223 del, 3229 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_devel/wer_11_0.0 +%WER 38.10 [ 4909 / 12886, 527 ins, 1220 del, 3162 sub ] exp/dnn4b_pretrain-dbn_dnn/decode_test/wer_11_0.0 +%WER 41.06 [ 4969 / 12101, 514 ins, 1277 del, 3178 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it1/wer_11_0.0 +%WER 40.38 [ 4886 / 12101, 515 ins, 1225 del, 3146 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it3/wer_11_0.0 +%WER 40.15 [ 4859 / 12101, 514 ins, 1177 del, 3168 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_devel_it6/wer_10_0.5 +%WER 37.86 [ 4879 / 12886, 596 ins, 1083 del, 3200 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it1/wer_10_0.0 +%WER 37.16 [ 4789 / 12886, 592 ins, 1056 del, 3141 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it3/wer_10_0.0 +%WER 36.92 [ 4757 / 12886, 618 ins, 1010 del, 3129 sub ] exp/dnn4b_pretrain-dbn_dnn_smbr/decode_test_it6/wer_10_0.0 +%WER 42.38 [ 5129 / 12101, 576 ins, 1171 del, 3382 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode_devel/wer_11_0.0 +%WER 39.14 [ 5043 / 12886, 536 ins, 1172 del, 3335 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn/decode_test/wer_11_0.0 +%WER 42.05 [ 5088 / 12101, 525 ins, 1282 del, 3281 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it1/wer_11_0.0 +%WER 41.41 [ 5011 / 12101, 461 ins, 1345 del, 3205 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it3/wer_11_0.5 +%WER 40.97 [ 4958 / 12101, 485 ins, 1279 del, 3194 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_devel_it6/wer_11_0.5 +%WER 38.79 [ 4998 / 12886, 512 ins, 1194 del, 3292 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it1/wer_11_0.0 +%WER 38.16 [ 4917 / 12886, 544 ins, 1128 del, 3245 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it3/wer_11_0.0 +%WER 37.68 [ 4856 / 12886, 564 ins, 1068 del, 3224 sub ] exp/dnn4d-fbank_pretrain-dbn_dnn_smbr/decode_test_it6/wer_11_0.0 +%WER 70.85 [ 8574 / 12101, 414 ins, 2596 del, 5564 sub ] exp/mono/decode_devel/wer_9_0.0 +%WER 68.17 [ 8785 / 12886, 413 ins, 2704 del, 5668 sub ] exp/mono/decode_test/wer_9_0.0 +%WER 44.05 [ 5330 / 12101, 560 ins, 1467 del, 3303 sub ] exp/sgmm2/decode_devel/wer_10_0.0 +%WER 40.22 [ 5183 / 12886, 680 ins, 1142 del, 3361 sub ] exp/sgmm2/decode_test/wer_9_0.0 +%WER 54.39 [ 6582 / 12101, 695 ins, 1595 del, 4292 sub ] exp/tri1/decode_devel/wer_10_0.0 +%WER 51.60 [ 6649 / 12886, 630 ins, 1706 del, 4313 sub ] exp/tri1/decode_test/wer_11_0.0 +%WER 51.53 [ 6236 / 12101, 659 ins, 1675 del, 3902 sub ] exp/tri2/decode_devel/wer_11_0.0 +%WER 48.32 [ 6226 / 12886, 643 ins, 1669 del, 3914 sub ] exp/tri2/decode_test/wer_12_0.0 +%WER 47.15 [ 5706 / 12101, 580 ins, 1537 del, 3589 sub ] exp/tri3/decode_devel/wer_13_0.0 +%WER 52.13 [ 6308 / 12101, 623 ins, 1706 del, 3979 sub ] exp/tri3/decode_devel.si/wer_11_0.5 +%WER 43.71 [ 5632 / 12886, 594 ins, 1538 del, 3500 sub ] exp/tri3/decode_test/wer_14_0.0 +%WER 48.21 [ 6212 / 12886, 825 ins, 1358 del, 4029 sub ] exp/tri3/decode_test.si/wer_10_0.0 diff --git a/egs/fame/s5/cmd.sh b/egs/fame/s5/cmd.sh new file mode 120000 index 00000000000..19f7e836644 --- /dev/null +++ b/egs/fame/s5/cmd.sh @@ -0,0 +1 @@ +../../wsj/s5/cmd.sh \ No newline at end of file diff --git a/egs/fame/s5/conf/decode_dnn.config b/egs/fame/s5/conf/decode_dnn.config new file mode 100644 index 00000000000..89dd9929a62 --- /dev/null +++ 
b/egs/fame/s5/conf/decode_dnn.config @@ -0,0 +1,2 @@ +beam=18.0 # beam for decoding. Was 13.0 in the scripts. +lattice_beam=10.0 # this has most effect on size of the lattices. diff --git a/egs/fame/s5/conf/fbank.conf new file mode 100644 index 00000000000..c4b73674cab --- /dev/null +++ b/egs/fame/s5/conf/fbank.conf @@ -0,0 +1,2 @@ +# No non-default options for now. + diff --git a/egs/fame/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/fame/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. diff --git a/egs/fame/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/fame/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600) diff --git a/egs/fame/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..cbdaf5f281c --- /dev/null +++ b/egs/fame/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/online/run_online_decoding_nnet2.sh diff --git a/egs/fame/s5/local/fame_data_prep.sh new file mode 100755 index 00000000000..2c2d1e79238 --- /dev/null +++ b/egs/fame/s5/local/fame_data_prep.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) + +# Apache 2.0 + +corpus=$1 +set -e -o pipefail +if [ -z "$corpus" ] ; then + echo >&2 "The script $0 expects one parameter -- the location of the FAME! speech database" + exit 1 +fi +if [ ! -d "$corpus" ] ; then + echo >&2 "The directory $corpus does not exist" +fi + +echo "Preparing train, development and test data" +mkdir -p data data/local data/train data/devel data/test + +for x in train devel test; do + echo "Copy spk2utt, utt2spk, wav.scp, text for $x" + cp $corpus/data/$x/text data/$x/text || exit 1; + cp $corpus/data/$x/spk2utt data/$x/spk2utt || exit 1; + cp $corpus/data/$x/utt2spk data/$x/utt2spk || exit 1; + + # the corpus wav.scp contains physical paths, so we just re-generate + # the file from scratch instead of figuring out how to edit it + for rec in $(awk '{print $1}' $corpus/data/$x/text) ; do + spk=${rec%_*} + filename=$corpus/fame/wav/${x}/${rec:8}.wav + if [ ! -f "$filename" ] ; then + echo >&2 "The file $filename could not be found ($rec)" + exit 1 + fi + # we might want to store physical paths as a general rule + filename=$(readlink -f $filename) + echo "$rec $filename" + done > data/$x/wav.scp + + # fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp, + # duplicate entries and so on).
Also, it regenerates the spk2utt from + # utt2spk + utils/fix_data_dir.sh data/$x +done + +echo "Copying language model" +if [ -f $corpus/lm/LM_FR_IKN3G ] ; then + gzip -c $corpus/lm/LM_FR_IKN3G > data/local/LM.gz +fi + +echo "Data preparation completed." + diff --git a/egs/fame/s5/local/fame_dict_prep.sh new file mode 100755 index 00000000000..c6530217a67 --- /dev/null +++ b/egs/fame/s5/local/fame_dict_prep.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2015-2016 Sarah Flora Juan +# Copyright 2016 Johns Hopkins University (Author: Yenda Trmal) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) + +# Apache 2.0 + +corpus=$1 +if [ -z "$corpus" ] ; then + echo >&2 "The script $0 expects one parameter -- the location of the FAME! speech database" + exit 1 +fi +if [ ! -d "$corpus" ] ; then + echo >&2 "The directory $corpus does not exist" +fi + +mkdir -p data/lang data/local/dict + + +cat $corpus/lexicon/lex.asr $corpus/lexicon/lex.oov > data/local/dict/lexicon.txt +echo "!SIL SIL" >> data/local/dict/lexicon.txt +echo " SPN" >> data/local/dict/lexicon.txt +env LC_ALL=C sort -u -o data/local/dict/lexicon.txt data/local/dict/lexicon.txt +cat data/local/dict/lexicon.txt | \ + perl -ane 'print join("\n", @F[1..$#F]) . "\n"; ' | \ + sort -u | grep -v 'SIL' > data/local/dict/nonsilence_phones.txt + + +touch data/local/dict/extra_questions.txt +touch data/local/dict/optional_silence.txt + +echo "SIL" > data/local/dict/optional_silence.txt +echo "SIL" > data/local/dict/silence_phones.txt +echo "" > data/local/dict/oov.txt + +echo "Dictionary preparation succeeded" diff --git a/egs/fame/s5/local/nnet/run_dnn.sh new file mode 100755 index 00000000000..ca1efa5e0ac --- /dev/null +++ b/egs/fame/s5/local/nnet/run_dnn.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) +# Apache 2.0 + +# This example script trains a DNN on top of fMLLR features. +# The training is done in 3 stages, +# +# 1) RBM pre-training: +# in this unsupervised stage we train a stack of RBMs, +# a good starting point for frame cross-entropy training. +# 2) frame cross-entropy training: +# the objective is to classify frames to correct pdfs. +# 3) sequence-training optimizing sMBR: +# the objective is to emphasize state-sequences with better +# frame accuracy w.r.t. reference alignment. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +set -eu + +# Config: +gmm=exp/tri3 +data_fmllr=data-fmllr-tri3 +stage=0 # resume training with --stage=N +# End of config. +. utils/parse_options.sh +# + +[ !
-e $data_fmllr/test ] && if [ $stage -le 0 ]; then + # Store fMLLR features, so we can train on them easily, + # devel + dir=$data_fmllr/devel + steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir $gmm/decode_devel \ + $dir data/devel $gmm $dir/log $dir/data + # test + dir=$data_fmllr/test + steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir $gmm/decode_test \ + $dir data/test $gmm $dir/log $dir/data + # train + dir=$data_fmllr/train + steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \ + --transform-dir ${gmm}_ali \ + $dir data/train $gmm $dir/log $dir/data + # split the data : 90% train 10% cross-validation (held-out) + utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 +fi + +if [ $stage -le 1 ]; then + # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN) + dir=exp/dnn4b_pretrain-dbn + $cuda_cmd $dir/log/pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh --hid-dim 2048 --rbm-iter 10 $data_fmllr/train $dir +fi + +if [ $stage -le 2 ]; then + # Train the DNN optimizing per-frame cross-entropy. + dir=exp/dnn4b_pretrain-dbn_dnn + ali=${gmm}_ali + feature_transform=exp/dnn4b_pretrain-dbn/final.feature_transform + dbn=exp/dnn4b_pretrain-dbn/6.dbn + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + $data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $data_fmllr/devel $dir/decode_devel + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $data_fmllr/test $dir/decode_test +fi + + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. +dir=exp/dnn4b_pretrain-dbn_dnn_smbr +srcdir=exp/dnn4b_pretrain-dbn_dnn +acwt=0.1 + +if [ $stage -le 3 ]; then + # First we generate lattices and alignments: + steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ + $data_fmllr/train data/lang $srcdir ${srcdir}_ali + steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $data_fmllr/train data/lang $srcdir ${srcdir}_denlats +fi + +if [ $stage -le 4 ]; then + # Re-train the DNN by 6 iterations of sMBR + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + $data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir + # Decode + for ITER in 6 3 1; do + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $data_fmllr/devel $dir/decode_devel_it${ITER} + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $data_fmllr/test $dir/decode_test_it${ITER} + done +fi + +echo Success +exit 0 + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done + +# to see how model conversion to nnet2 works, run run_dnn_convert_nnet2.sh at this point. 
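# The --stage flag above (parsed by utils/parse_options.sh) lets this recipe be
# resumed part-way through. An illustrative invocation from egs/fame/s5,
# assuming the fMLLR features and the pre-trained DBN from stages 0-1 already
# exist on disk:
#
#   local/nnet/run_dnn.sh --stage 2
#   for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
#
# The second line is the same WER scan quoted in the comments above and in
# run.sh; it prints the best-scoring wer_* entry for every decode directory.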
+ diff --git a/egs/fame/s5/local/nnet/run_dnn_fbank.sh b/egs/fame/s5/local/nnet/run_dnn_fbank.sh new file mode 100755 index 00000000000..a81449ffbcf --- /dev/null +++ b/egs/fame/s5/local/nnet/run_dnn_fbank.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely) +# Copyright 2016 Radboud University (Author: Emre Yilmaz) +# Apache 2.0 + +# This example script trains a DNN on top of FBANK features. +# The training is done in 3 stages, +# +# 1) RBM pre-training: +# in this unsupervised stage we train stack of RBMs, +# a good starting point for frame cross-entropy trainig. +# 2) frame cross-entropy training: +# the objective is to classify frames to correct pdfs. +# 3) sequence-training optimizing sMBR: +# the objective is to emphasize state-sequences with better +# frame accuracy w.r.t. reference alignment. + +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. + +. ./path.sh ## Source the tools/utils (import the queue.pl) + +dev=data-fbank/devel +tst=data-fbank/test +train=data-fbank/train + +dev_original=data/devel +tst_original=data/test +train_original=data/train + +gmm=exp/tri3 + +stage=0 +. utils/parse_options.sh || exit 1; + +set -eu + +# Make the FBANK features +[ ! -e $dev ] && if [ $stage -le 0 ]; then + # Dev set + utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp + steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \ + $dev $dev/log $dev/data || exit 1; + steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1; + # Test set + utils/copy_data_dir.sh $tst_original $tst || exit 1; rm $tst/{cmvn,feats}.scp + steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \ + $tst $tst/log $tst/data || exit 1; + steps/compute_cmvn_stats.sh $tst $tst/log $tst/data || exit 1; + # Training set + utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp + steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \ + $train $train/log $train/data || exit 1; + steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1; + # Split the training set + utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10 +fi + +if [ $stage -le 1 ]; then + # Pre-train DBN, i.e. a stack of RBMs (small database, smaller DNN) + dir=exp/dnn4d-fbank_pretrain-dbn + $cuda_cmd $dir/log/pretrain_dbn.log \ + steps/nnet/pretrain_dbn.sh \ + --cmvn-opts "--norm-means=true --norm-vars=true" \ + --delta-opts "--delta-order=2" --splice 5 \ + --hid-dim 2048 --rbm-iter 10 $train $dir || exit 1; +fi + +if [ $stage -le 2 ]; then + # Train the DNN optimizing per-frame cross-entropy. 
+ dir=exp/dnn4d-fbank_pretrain-dbn_dnn + ali=${gmm}_ali + feature_transform=exp/dnn4d-fbank_pretrain-dbn/final.feature_transform + dbn=exp/dnn4d-fbank_pretrain-dbn/6.dbn + # Train + $cuda_cmd $dir/log/train_nnet.log \ + steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \ + ${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1; + # Decode (reuse HCLG graph) + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $dev $dir/decode_devel || exit 1; + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \ + $gmm/graph $tst $dir/decode_test || exit 1; +fi + + +# Sequence training using sMBR criterion, we do Stochastic-GD with per-utterance updates. +# Note: With DNNs in RM, the optimal LMWT is 2-6. Don't be tempted to try acwt's like 0.2, +# the value 0.1 is better both for decoding and sMBR. +dir=exp/dnn4d-fbank_pretrain-dbn_dnn_smbr +srcdir=exp/dnn4d-fbank_pretrain-dbn_dnn +acwt=0.1 + +if [ $stage -le 3 ]; then + # First we generate lattices and alignments: + steps/nnet/align.sh --nj 20 --cmd "$train_cmd" \ + $train data/lang $srcdir ${srcdir}_ali || exit 1; + steps/nnet/make_denlats.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \ + $train data/lang $srcdir ${srcdir}_denlats || exit 1; +fi + +if [ $stage -le 4 ]; then + # Re-train the DNN by 6 iterations of sMBR + steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt $acwt --do-smbr true \ + $train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1 + # Decode + for ITER in 6 3 1; do + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $dev $dir/decode_devel_it${ITER} || exit 1 + steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \ + --nnet $dir/${ITER}.nnet --acwt $acwt \ + $gmm/graph $tst $dir/decode_test_it${ITER} || exit 1 + done +fi + +echo Success +exit 0 + +# Getting results [see RESULTS file] +# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/fame/s5/local/score.sh b/egs/fame/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/fame/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/fame/s5/local/wer_hyp_filter b/egs/fame/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..372d1a9c73a --- /dev/null +++ b/egs/fame/s5/local/wer_hyp_filter @@ -0,0 +1,2 @@ +#!/bin/sed -f +s:::g diff --git a/egs/fame/s5/local/wer_output_filter b/egs/fame/s5/local/wer_output_filter new file mode 100755 index 00000000000..372d1a9c73a --- /dev/null +++ b/egs/fame/s5/local/wer_output_filter @@ -0,0 +1,2 @@ +#!/bin/sed -f +s:::g diff --git a/egs/fame/s5/local/wer_ref_filter b/egs/fame/s5/local/wer_ref_filter new file mode 100755 index 00000000000..372d1a9c73a --- /dev/null +++ b/egs/fame/s5/local/wer_ref_filter @@ -0,0 +1,2 @@ +#!/bin/sed -f +s:::g diff --git a/egs/fame/s5/path.sh b/egs/fame/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/fame/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/fame/s5/run.sh b/egs/fame/s5/run.sh new file mode 100755 index 00000000000..26a8485ff7d --- /dev/null +++ b/egs/fame/s5/run.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +stage=0 +feat_nj=10 +train_nj=10 +decode_nj=10 +famecorpus=./corpus + +if [ -d $famecorpus ] ; then + echo "Fame corpus present. OK." +elif [ -f ./fame.tar.gz ] ; then + echo "Unpacking..." + tar xzf fame.tar.gz +elif [ ! -d $famecorpus ] && [ ! -f ./fame.tar.gz ] ; then + echo "The Fame! corpus is not present. Please register here: http://www.ru.nl/clst/datasets/ " + echo " and download the corpus and put it at $famecorpus" && exit 1 +fi + +numLeavesTri1=5000 +numGaussTri1=25000 +numLeavesMLLT=5000 +numGaussMLLT=25000 +numLeavesSAT=5000 +numGaussSAT=25000 +numGaussUBM=800 +numLeavesSGMM=10000 +numGaussSGMM=20000 + +if [ $stage -le 1 ]; then + local/fame_data_prep.sh $famecorpus || exit 1; + local/fame_dict_prep.sh $famecorpus || exit 1; + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang || exit 1; + utils/format_lm.sh data/lang data/local/LM.gz data/local/dict/lexicon.txt data/lang_test || exit 1; +fi + +if [ $stage -le 2 ]; then + # Feature extraction + for x in train devel test; do + steps/make_mfcc.sh --nj $feat_nj --cmd "$train_cmd" data/$x exp/make_mfcc/$x mfcc || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc || exit 1; + done +fi + +if [ $stage -le 3 ]; then + ### Monophone + echo "Starting monophone training." + steps/train_mono.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; + echo "Mono training done." + + echo "Decoding the development and test sets using monophone models." + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/mono/graph data/devel exp/mono/decode_devel || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/mono/graph data/test exp/mono/decode_test || exit 1; + echo "Monophone decoding done." +fi + + +if [ $stage -le 4 ]; then + ### Triphone + echo "Starting triphone training." + steps/align_si.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/mono exp/mono_ali || exit 1; + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" $numLeavesTri1 $numGaussTri1 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + echo "Triphone training done." + + echo "Decoding the development and test sets using triphone models." + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri1/graph data/devel exp/tri1/decode_devel || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri1/graph data/test exp/tri1/decode_test || exit 1; + echo "Triphone decoding done." +fi + +if [ $stage -le 5 ]; then + ### Triphone + LDA and MLLT + echo "Starting LDA+MLLT training." + steps/align_si.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/train_lda_mllt.sh --cmd "$train_cmd" --splice-opts "--left-context=3 --right-context=3" $numLeavesMLLT $numGaussMLLT data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + echo "LDA+MLLT training done." + + echo "Decoding the development and test sets using LDA+MLLT models." 
+ utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri2/graph data/devel exp/tri2/decode_devel || exit 1; + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri2/graph data/test exp/tri2/decode_test || exit 1; + echo "LDA+MLLT decoding done." +fi + + +if [ $stage -le 6 ]; then + ### Triphone + LDA and MLLT + SAT and FMLLR + echo "Starting SAT+FMLLR training." + steps/align_si.sh --nj $train_nj --cmd "$train_cmd" --use-graphs true data/train data/lang exp/tri2 exp/tri2_ali || exit 1; + steps/train_sat.sh --cmd "$train_cmd" $numLeavesSAT $numGaussSAT data/train data/lang exp/tri2_ali exp/tri3 || exit 1; + echo "SAT+FMLLR training done." + + echo "Decoding the development and test sets using SAT+FMLLR models." + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph || exit 1; + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri3/graph data/devel exp/tri3/decode_devel || exit 1; + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" exp/tri3/graph data/test exp/tri3/decode_test || exit 1; + echo "SAT+FMLLR decoding done." +fi + + +if [ $stage -le 7 ]; then + echo "Starting SGMM training." + steps/align_fmllr.sh --nj $train_nj --cmd "$train_cmd" data/train data/lang exp/tri3 exp/tri3_ali || exit 1; + steps/train_ubm.sh --cmd "$train_cmd" $numGaussUBM data/train data/lang exp/tri3_ali exp/ubm || exit 1; + steps/train_sgmm2.sh --cmd "$train_cmd" $numLeavesSGMM $numGaussSGMM data/train data/lang exp/tri3_ali exp/ubm/final.ubm exp/sgmm2 || exit 1; + echo "SGMM training done." + + echo "Decoding the development and test sets using SGMM models" + utils/mkgraph.sh data/lang_test exp/sgmm2 exp/sgmm2/graph || exit 1; + steps/decode_sgmm2.sh --nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri3/decode_devel exp/sgmm2/graph data/devel exp/sgmm2/decode_devel || exit 1; + steps/decode_sgmm2.sh --nj $decode_nj --cmd "$decode_cmd" --transform-dir exp/tri3/decode_test exp/sgmm2/graph data/test exp/sgmm2/decode_test || exit 1; + echo "SGMM decoding done." +fi + +if [ $stage -le 8 ]; then + echo "Starting DNN training and decoding." 
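  # run_dnn_fbank.sh (partially shown above) does RBM pre-training, frame
  # cross-entropy training on top of the DBN, and sMBR sequence training via
  # steps/nnet/train_mpe.sh --do-smbr true, decoding with --acwt 0.1;
  # run_dnn.sh is assumed to follow the same pattern on fMLLR features.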
+ local/nnet/run_dnn.sh || exit 1; + local/nnet/run_dnn_fbank.sh || exit 1; +fi + +#score +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done diff --git a/egs/fame/s5/steps b/egs/fame/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/fame/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/fame/s5/utils b/egs/fame/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/fame/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 380a8aec936..ad650cd390e 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -256,7 +256,7 @@ steps/train_mmi_sgmm2.sh \ ( utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph -steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ +steps/decode_fmllr_extra.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --config conf/decode.config --scoring-opts "--min-lmwt 8 --max-lmwt 12"\ exp/tri5a/graph data/dev exp/tri5a/decode_dev utils/mkgraph.sh data/lang_test exp/sgmm5 exp/sgmm5/graph @@ -274,9 +274,9 @@ done dnn_cpu_parallel_opts=(--minibatch-size 128 --max-change 10 --num-jobs-nnet 8 --num-threads 16 \ - --parallel-opts "-pe smp 16" --cmd "queue.pl -l arch=*64 --mem 2G") + --parallel-opts "--num-threads 16" --cmd "queue.pl --mem 2G") dnn_gpu_parallel_opts=(--minibatch-size 512 --max-change 40 --num-jobs-nnet 4 --num-threads 1 \ - --parallel-opts "-l gpu=1" --cmd "queue.pl -l arch=*64 --mem 2G") + --parallel-opts "--gpu 1" --cmd "queue.pl --mem 2G") steps/nnet2/train_pnorm_ensemble.sh \ --mix-up 5000 --initial-learning-rate 0.008 --final-learning-rate 0.0008\ @@ -287,7 +287,7 @@ steps/nnet2/train_pnorm_ensemble.sh \ data/train data/lang exp/tri5a_ali exp/tri6a_dnn ( - steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " -pe smp 4" \ + steps/nnet2/decode.sh --nj 13 --cmd "$decode_cmd" --num-threads 4 --parallel-opts " --num-threads 4" \ --scoring-opts "--min-lmwt 8 --max-lmwt 16" --transform-dir exp/tri5a/decode_dev exp/tri5a/graph data/dev exp/tri6a_dnn/decode_dev ) & wait diff --git a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh index eae5f7b8581..210d0f5646f 100755 --- a/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh +++ b/egs/fisher_english/s5/local/nnet2/run_6c_gpu.sh @@ -21,7 +21,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ "$USER" == dpovey ]; then diff --git a/egs/fisher_english/s5/local/online/run_nnet2.sh b/egs/fisher_english/s5/local/online/run_nnet2.sh index 0b9adb7d315..de4d56bb52e 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2.sh @@ -21,7 +21,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. 
EOF fi -parallel_opts="-l gpu=1" +parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 dir=exp/nnet2_online/nnet_a diff --git a/egs/fisher_english/s5/local/online/run_nnet2_b.sh b/egs/fisher_english/s5/local/online/run_nnet2_b.sh index 7eac7cf0a7d..e1491a10c0b 100755 --- a/egs/fisher_english/s5/local/online/run_nnet2_b.sh +++ b/egs/fisher_english/s5/local/online/run_nnet2_b.sh @@ -19,22 +19,22 @@ set -e if $use_gpu; then if ! cuda-compiled; then - cat <0){ seen[$1]=1; } } +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ > $cleantext || exit 1; @@ -75,7 +59,7 @@ train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1; # note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz +# data/local/lm/3gram-mincount/lm_unpruned.gz exit 0 @@ -97,7 +81,7 @@ cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz -ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout # data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM # file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs @@ -106,7 +90,7 @@ ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout # Note: perplexity SRILM gives to Kaldi-LM model is similar to what kaldi-lm reports above. # Difference in WSJ must have been due to different treatment of . -ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout +ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout # data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for in closed-vocabulary LM # file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs diff --git a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh index 4afa867503a..324061aa5ac 100644 --- a/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/fisher_swbd/s5/local/nnet3/run_tdnn_discriminative.sh @@ -8,7 +8,7 @@ set -o pipefail # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# . ./cmd.sh @@ -38,27 +38,21 @@ dir=${srcdir}_${criterion} ## Egs options frames_per_eg=150 frames_overlap_per_eg=30 -truncate_deriv_weights=10 ## Nnet training options effective_learning_rate=0.00000125 max_param_change=1 num_jobs_nnet=4 num_epochs=2 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -modify_learning_rates=true -last_layer_factor=0.1 - ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. if $use_gpu; then if ! cuda-compiled; then - cat <" + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2007S10" echo "See comments in the script for more details" exit 1 fi @@ -19,7 +20,7 @@ sdir=$1 [ ! 
-d $sdir/data/references/eval03/english/cts ] \ && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; -. path.sh +. path.sh dir=data/local/rt03 mkdir -p $dir @@ -37,7 +38,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -47,7 +48,7 @@ awk -v sph2pipe=$sph2pipe '{ # sw02001-A_000098-001156 sw02001-A 0.98 11.56 #pem=$sdir/english/hub5e_00.pem #[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ @@ -59,7 +60,7 @@ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ | sort -u > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -77,7 +78,7 @@ cat $tdir/*.stm | \ grep -v inter_segment_gap | \ awk '{ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm + > $dir/stm #$tdir/reference/hub5e00.english.000405.stm > $dir/stm cp $rtroot/data/trans_rules/en20030506.glm $dir/glm @@ -87,10 +88,10 @@ cp $rtroot/data/trans_rules/en20030506.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -110,4 +111,3 @@ done echo Data preparation and formatting completed for RT-03 echo "(but not MFCC extraction)" - diff --git a/egs/fisher_swbd/s5/path.sh b/egs/fisher_swbd/s5/path.sh index e14c6074f6b..2d17b17a84a 100755 --- a/egs/fisher_swbd/s5/path.sh +++ b/egs/fisher_swbd/s5/path.sh @@ -1,6 +1,6 @@ -export KALDI_ROOT=`pwd`/../../../ -export PWD=`pwd` -export PATH=$KALDI_ROOT/src/ivectorbin:$PWD/stanford-utils:$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$KALDI_ROOT/tools/kaldi_lm:$KALDI_ROOT/tools/srilm/bin:$KALDI_ROOT/tools/srilm/bin/i686-m64:$PATH +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . 
$KALDI_ROOT/tools/config/common_path.sh export LC_ALL=C diff --git a/egs/fisher_swbd/s5/run.sh b/egs/fisher_swbd/s5/run.sh index 5addefc5fe1..8b1af972647 100755 --- a/egs/fisher_swbd/s5/run.sh +++ b/egs/fisher_swbd/s5/run.sh @@ -7,6 +7,7 @@ mfccdir=mfcc set -e rescore=true + # prepare fisher data and put it under data/train_fisher local/fisher_data_prep.sh /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19 \ /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13 @@ -40,7 +41,7 @@ for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do done # LM for train_all -local/fisher_train_lms.sh +local/fisher_train_lms.sh #local/fisher_create_test_lang.sh # Compiles G for trigram LM LM=data/local/lm/3gram-mincount/lm_unpruned.gz @@ -58,7 +59,7 @@ fi #local/eval2000_data_prep.sh /scail/group/deeplearning/speech/datasets/LDC2002S09/hub5e_00/ /scail/group/deeplearning/speech/datasets/LDC2002T43 || exit 1 local/eval2000_data_prep.sh /export/corpora/LDC/LDC2002S09/hub5e_00 /export/corpora/LDC/LDC2002T43 || exit 1 - + #local/rt03_data_prep.sh /scail/group/deeplearning/speech/datasets/rt_03 || exit 1 local/rt03_data_prep.sh /export/corpora/LDC/LDC2007S10 || exit 1 @@ -66,6 +67,12 @@ utils/fix_data_dir.sh data/train_all # Make MFCCs for the training set +# spread the mfccs over various machines, as this data-set is quite large. +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then + mfcc=$(basename $mfccdir) # in case was absolute pathname (unlikely), get basename. + utils/create_split_dir.pl /export/b{05,06,07,08}/$USER/kaldi-data/egs/fisher_swbd/s5/$mfcc/storage \ + $mfccdir/storage +fi steps/make_mfcc.sh --nj 100 --cmd "$train_cmd" data/train_all exp/make_mfcc/train_all $mfccdir || exit 1; utils/fix_data_dir.sh data/train_all utils/validate_data_dir.sh data/train_all @@ -111,31 +118,31 @@ utils/data/remove_dup_utts.sh 200 data/train_30k data/train_30k_nodup utils/data/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup utils/data/remove_dup_utts.sh 300 data/train data/train_nodup -# The next commands are not necessary for the scripts to run, but increase -# efficiency of data access by putting the mfcc's of the subset +# The next commands are not necessary for the scripts to run, but increase +# efficiency of data access by putting the mfcc's of the subset # in a contiguous place in a file. -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. - cp data/train_10k_nodup/feats.scp{,.bak} + cp data/train_10k_nodup/feats.scp{,.bak} copy-feats scp:data/train_10k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_10k_nodup.ark,$mfccdir/kaldi_fish_10k_nodup.scp \ && cp $mfccdir/kaldi_fish_10k_nodup.scp data/train_10k_nodup/feats.scp ) -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. - cp data/train_30k_nodup/feats.scp{,.bak} + cp data/train_30k_nodup/feats.scp{,.bak} copy-feats scp:data/train_30k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_30k_nodup.ark,$mfccdir/kaldi_fish_30k_nodup.scp \ && cp $mfccdir/kaldi_fish_30k_nodup.scp data/train_30k_nodup/feats.scp ) -( . path.sh; +( . path.sh; # make sure mfccdir is defined as above.. 
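  # copy-feats with an "ark,scp:" output writes one contiguous archive plus a
  # matching .scp index whose entries point at byte offsets, e.g. (hypothetical
  # utterance id)
  #   fsh_100k-A_000123 /path/to/mfcc/kaldi_fish_100k_nodup.ark:12345
  # so replacing feats.scp with that index gives fast sequential reads.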
- cp data/train_100k_nodup/feats.scp{,.bak} + cp data/train_100k_nodup/feats.scp{,.bak} copy-feats scp:data/train_100k_nodup/feats.scp ark,scp:$mfccdir/kaldi_fish_100k_nodup.ark,$mfccdir/kaldi_fish_100k_nodup.scp \ && cp $mfccdir/kaldi_fish_100k_nodup.scp data/train_100k_nodup/feats.scp ) # Start training on the Switchboard subset, which has cleaner alignments steps/train_mono.sh --nj 3 --cmd "$train_cmd" \ - data/train_10k_nodup data/lang_nosp exp/mono0a + data/train_10k_nodup data/lang_nosp exp/mono0a steps/align_si.sh --nj 10 --cmd "$train_cmd" \ data/train_30k_nodup data/lang_nosp exp/mono0a exp/mono0a_ali || exit 1; @@ -171,8 +178,8 @@ steps/align_si.sh --nj 50 --cmd "$train_cmd" \ steps/train_deltas.sh --cmd "$train_cmd" \ 5500 90000 data/train_100k_nodup data/lang_nosp exp/tri1b_ali exp/tri2 || exit 1; #used to be 2500 20000 on 30k -( - graph_dir=exp/tri2/graph_nosp_fsh_sw1_tg +( + graph_dir=exp/tri2/graph_nosp_fsh_sw1_tg utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri2 $graph_dir || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ $graph_dir data/eval2000 exp/tri2/decode_eval2000_nosp_fsh_sw1_tg || exit 1; @@ -180,7 +187,7 @@ steps/train_deltas.sh --cmd "$train_cmd" \ $graph_dir data/rt03 exp/tri2/decode_rt03_nosp_fsh_sw1_tg || exit 1; )& -# Train tri3a, the last speaker-independent triphone stage, +# Train tri3a, the last speaker-independent triphone stage, # on the whole Switchboard training set steps/align_si.sh --nj 100 --cmd "$train_cmd" \ data/train_swbd data/lang_nosp exp/tri2 exp/tri2_ali || exit 1; @@ -189,8 +196,8 @@ steps/train_deltas.sh --cmd "$train_cmd" \ 11500 200000 data/train_swbd data/lang_nosp exp/tri2_ali exp/tri3a || exit 1; #used to be 2500 20000 -( - graph_dir=exp/tri3a/graph_nosp_fsh_sw1_tg +( + graph_dir=exp/tri3a/graph_nosp_fsh_sw1_tg utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3a $graph_dir || exit 1; steps/decode.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ $graph_dir data/eval2000 exp/tri3a/decode_eval2000_nosp_fsh_sw1_tg || exit 1; @@ -205,8 +212,8 @@ steps/align_si.sh --nj 100 --cmd "$train_cmd" \ steps/train_lda_mllt.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ 11500 400000 data/train_nodup data/lang_nosp exp/tri3a_ali exp/tri3b || exit 1; -( - graph_dir=exp/tri3b/graph_nosp_fsh_sw1_tg +( + graph_dir=exp/tri3b/graph_nosp_fsh_sw1_tg utils/mkgraph.sh data/lang_nosp_fsh_sw1_tg exp/tri3b $graph_dir || exit 1; steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ $graph_dir data/eval2000 exp/tri3b/decode_eval2000_nosp_fsh_sw1_tg || exit 1; @@ -232,16 +239,16 @@ if [ $rescore ]; then utils/build_const_arpa_lm.sh $LM_fg data/lang data/lang_fsh_sw1_fg fi -( +( graph_dir=exp/tri3b/graph_fsh_sw1_tg utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri3b $graph_dir || exit 1; steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ $graph_dir data/eval2000 exp/tri3b/decode_eval2000_fsh_sw1_tg || exit 1; steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ $graph_dir data/rt03 exp/tri3b/decode_rt03_fsh_sw1_tg || exit 1; -) & +)& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. 
on # fMLLR features) steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ @@ -250,7 +257,7 @@ steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ steps/train_sat.sh --cmd "$train_cmd" \ 11500 800000 data/train_nodup data/lang exp/tri3b_ali exp/tri4a || exit 1; -( +( graph_dir=exp/tri4a/graph_fsh_sw1_tg utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri4a $graph_dir steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ @@ -272,11 +279,10 @@ fi steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ data/train_nodup data/lang exp/tri4a exp/tri4a_ali || exit 1; - steps/train_sat.sh --cmd "$train_cmd" \ 11500 1600000 data/train_nodup data/lang exp/tri4a_ali exp/tri5a || exit 1; -( +( graph_dir=exp/tri5a/graph_fsh_sw1_tg utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri5a $graph_dir steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ @@ -308,7 +314,7 @@ steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ steps/train_sat.sh --cmd "$train_cmd" \ 11500 3200000 data/train_nodup data/lang exp/tri5a_ali exp/tri6a || exit 1; -( +( graph_dir=exp/tri6a/graph_fsh_sw1_tg utils/mkgraph.sh data/lang_fsh_sw1_tg exp/tri6a $graph_dir steps/decode_fmllr.sh --nj 25 --cmd "$decode_cmd" --config conf/decode.config \ @@ -331,9 +337,6 @@ fi #steps/align_fmllr.sh --nj 200 --cmd "$train_cmd" \ # data/train_nodup data/lang exp/tri6a exp/tri6a_ali || exit 1; - -# # The following is the current online-nnet2 recipe, with "multi-splice". +# The following is the current online-nnet2 recipe, with "multi-splice". # local/online/run_nnet2_ms.sh local/online/run_nnet2_ms.sh - - diff --git a/egs/gale_arabic/s5/local/online/run_nnet2.sh b/egs/gale_arabic/s5/local/online/run_nnet2.sh index 8ccbda5a8dc..0db62242459 100644 --- a/egs/gale_arabic/s5/local/online/run_nnet2.sh +++ b/egs/gale_arabic/s5/local/online/run_nnet2.sh @@ -18,23 +18,23 @@ decode_nj=30 if $use_gpu; then if ! cuda-compiled; then - cat <) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print " $s"; + } else { + @chars = split "", $s; + foreach $c (@chars) { + if ($c =~ /\p{InCJK_Unified_Ideographs}/) { + print " $c"; + } else { + print "$c"; + } + } + } + print " "; + } + print "\n"; +} + + diff --git a/egs/hkust/s5/local/hkust_data_prep.sh b/egs/hkust/s5/local/hkust_data_prep.sh index 07f3c9677d8..207f03af36b 100755 --- a/egs/hkust/s5/local/hkust_data_prep.sh +++ b/egs/hkust/s5/local/hkust_data_prep.sh @@ -104,8 +104,8 @@ awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];e print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp -sph2pipe=`cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe` -[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; +sph2pipe=`which sph2pipe` || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +[ ! 
-x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; cat $train_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ @@ -136,5 +136,4 @@ cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir cat $dev_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1; echo "$0: HKUST data preparation succeeded" - -exit; +exit 0 diff --git a/egs/hkust/s5/local/hkust_prepare_dict.sh b/egs/hkust/s5/local/hkust_prepare_dict.sh index 5cd864c52cc..6aca37586ed 100755 --- a/egs/hkust/s5/local/hkust_prepare_dict.sh +++ b/egs/hkust/s5/local/hkust_prepare_dict.sh @@ -312,5 +312,4 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", cat - $dict_dir/lexicon1.txt > $dict_dir/lexicon.txt || exit 1; echo "$0: HKUST dict preparation succeeded" - -exit; +exit 0; diff --git a/egs/hkust/s5/local/hkust_train_lms.sh b/egs/hkust/s5/local/hkust_train_lms.sh index d6d0b2aa0bc..8520bb26d2d 100755 --- a/egs/hkust/s5/local/hkust_train_lms.sh +++ b/egs/hkust/s5/local/hkust_train_lms.sh @@ -19,9 +19,13 @@ done dir=data/local/lm mkdir -p $dir +export LC_ALL=C # You'll get errors about things being not sorted, if you + # have a different locale. kaldi_lm=`which train_lm.sh` if [ ! -x $kaldi_lm ]; then - echo "train_lm.sh is not found. Checkout tools/extra/install_kaldi_lm.sh" + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" exit 1 fi diff --git a/egs/hkust/s5/local/online/run_nnet2_ms.sh b/egs/hkust/s5/local/online/run_nnet2_ms.sh index b935d86fa90..c3177e1136e 100755 --- a/egs/hkust/s5/local/online/run_nnet2_ms.sh +++ b/egs/hkust/s5/local/online/run_nnet2_ms.sh @@ -20,7 +20,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. Otherwise, call this script with --use-gpu false EOF fi - parallel_opts="-l gpu=1" + parallel_opts="--gpu 1" num_threads=1 minibatch_size=512 # the _a is in case I want to change the parameters. @@ -29,7 +29,7 @@ else # almost the same, but this may be a little bit slow. 
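  # New-style queue options (understood by queue.pl and the other
  # parallelization wrappers, which map them to the back-end's native flags)
  # replace the raw SGE flags; roughly:
  #   -l gpu=1                    ->  --gpu 1
  #   -pe smp 16                  ->  --num-threads 16
  #   -l mem_free=2G,ram_free=2G  ->  --mem 2G
  #   -tc 5                       ->  --max-jobs-run 5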
num_threads=16 minibatch_size=128 - parallel_opts="-pe smp $num_threads" + parallel_opts="--num-threads $num_threads" fi # Run the common stages of training, including training the iVector extractor diff --git a/egs/hkust/s5/local/score.sh b/egs/hkust/s5/local/score.sh deleted file mode 120000 index df664a0f1f1..00000000000 --- a/egs/hkust/s5/local/score.sh +++ /dev/null @@ -1 +0,0 @@ -../steps/scoring/score_kaldi_cer.sh \ No newline at end of file diff --git a/egs/hkust/s5/local/score.sh b/egs/hkust/s5/local/score.sh new file mode 100755 index 00000000000..766eaf3cd44 --- /dev/null +++ b/egs/hkust/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/hkust/s5/local/wer_output_filter b/egs/hkust/s5/local/wer_output_filter new file mode 100755 index 00000000000..aceeeec41b4 --- /dev/null +++ b/egs/hkust/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/iban/s5/local/prepare_lm.sh b/egs/iban/s5/local/prepare_lm.sh index a19dc18f566..10d5e276aa3 100755 --- a/egs/iban/s5/local/prepare_lm.sh +++ b/egs/iban/s5/local/prepare_lm.sh @@ -10,7 +10,7 @@ set -e -o pipefail local/train_lms_srilm.sh --train-text data/train/text data/ data/srilm -nl -nrz -w10 corpus/LM/iban-bp-2012.txt | sort -R > data/local/external_text +nl -nrz -w10 corpus/LM/iban-bp-2012.txt | utils/shuffle_list.pl > data/local/external_text local/train_lms_srilm.sh --train-text data/local/external_text data/ data/srilm_external # let's do ngram interpolation of the previous two LMs @@ -21,7 +21,7 @@ for w in 0.9 0.8 0.7 0.6 0.5; do ngram -lm data/srilm/lm.gz -mix-lm data/srilm_external/lm.gz \ -lambda $w -write-lm data/srilm_interp/lm.${w}.gz echo -n "data/srilm_interp/lm.${w}.gz " - ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s + ngram -lm data/srilm_interp/lm.${w}.gz -ppl data/srilm/dev.txt | paste -s - done | sort -k15,15g > data/srilm_interp/perplexities.txt # for basic decoding, let's use only a trigram LM diff --git a/egs/iban/s5/local/train_lms_srilm.sh b/egs/iban/s5/local/train_lms_srilm.sh index 9ed88842650..f72596e750a 100755 --- a/egs/iban/s5/local/train_lms_srilm.sh +++ b/egs/iban/s5/local/train_lms_srilm.sh @@ -206,9 +206,9 @@ echo "--------------------" echo "Computing perplexity" echo "--------------------" ( - for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done - for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done - for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/2gram* ; do ( echo $f; ngram -order 2 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done + for f in 
$tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' - ; done ) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt echo "The perlexity scores report is stored in $tgtdir/perplexities.txt " diff --git a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh index bda883f16c2..aeb0a7164e2 100755 --- a/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn_discriminative.sh @@ -10,7 +10,7 @@ set -e # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# stage=0 @@ -44,7 +44,6 @@ dir=${srcdir}_${criterion} ## Egs options frames_per_eg=150 frames_overlap_per_eg=30 -truncate_deriv_weights=10 ## Nnet training options effective_learning_rate=0.000001 @@ -59,8 +58,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true data_dirs= - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ $x $train_data_dir exp/shift_hires mfcc_hires utils/fix_data_dir.sh ${train_data_dir}_fs$x @@ -103,7 +102,7 @@ if [ $frame_subsampling_factor -ne 1 ]; then awk -v nfs=$x '{print "fs"nfs"-"$0}' $train_ivector_dir/ivector_online.scp >> ${train_ivector_dir}_fs/ivector_online.scp done utils/combine_data.sh ${train_data_dir}_fs $data_dirs - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do rm -r ${train_data_dir}_fs$x done fi @@ -112,9 +111,9 @@ if [ $frame_subsampling_factor -ne 1 ]; then affix=_fs fi - + rm ${train_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true -for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do awk -v nfs=$x '{print "fs"nfs"-"$0}' $train_ivector_dir/ivector_online.scp >> ${train_ivector_dir}_fs/ivector_online.scp done train_ivector_dir=${train_ivector_dir}_fs @@ -133,7 +132,7 @@ fi if [ -z "$lats_dir" ]; then lats_dir=${srcdir}_denlats${affix} if [ $stage -le 2 ]; then - nj=50 + nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of # the phases of get_egs_discriminative.sh below. 
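  # (Sketch of this stage:) make_denlats.sh decodes the training data with the
  # current model to produce denominator lattices; the discriminative objective
  # (e.g. sMBR) later contrasts these against the numerator alignments in
  # ${srcdir}_ali${affix} used below. num_threads_denlats sets the threads per
  # decoding job.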
num_threads_denlats=6 @@ -147,16 +146,13 @@ if [ -z "$lats_dir" ]; then fi fi -model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` -model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - -cmvn_opts=`cat $srcdir/cmvn_opts` +cmvn_opts=`cat $srcdir/cmvn_opts` if [ -z "$degs_dir" ]; then degs_dir=${srcdir}_degs${affix} @@ -169,16 +165,13 @@ if [ -z "$degs_dir" ]; then # have a higher maximum num-jobs if if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi - degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" - steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ --adjust-priors false --acwt 1.0 \ --online-ivector-dir $train_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ - --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; fi fi @@ -191,7 +184,7 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" --use-frame-shift false \ - --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --adjust-priors false \ ${degs_dir} $dir ; fi @@ -202,7 +195,7 @@ if [ $stage -le 5 ]; then ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` iter=epoch$[x*frame_subsampling_factor] - + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ @@ -219,7 +212,7 @@ if [ $stage -le 5 ]; then done done wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 fi if [ $stage -le 6 ] && $cleanup; then @@ -231,4 +224,3 @@ fi exit 0; - diff --git a/egs/librispeech/s5/local/nnet2/run_5c.sh b/egs/librispeech/s5/local/nnet2/run_5c.sh index bf261b93910..956a8f09348 100755 --- a/egs/librispeech/s5/local/nnet2/run_5c.sh +++ b/egs/librispeech/s5/local/nnet2/run_5c.sh @@ -1,7 +1,7 @@ #!/bin/bash # This is neural net training on top of adapted 40-dimensional features. -# +# train_stage=-10 use_gpu=true @@ -16,8 +16,8 @@ test_sets="dev-clean dev-other" if $use_gpu; then if ! cuda-compiled; then - cat < # stage to do partial re-run from." 
echo " --num-gselect # Number of Gaussians per frame to" echo " # limit computation to, for speed" - echo " --subsample # In main E-M phase, use every n" + echo " --subsample # In main E-M phase, use every n" echo " # frames (a speedup)" echo " --num-frames # Maximum num-frames to keep in memory" echo " # for model initialization" @@ -59,7 +59,7 @@ if [ $# != 3 ]; then echo " # in initialization phase (then split)" echo " --num-threads # number of threads to use in initialization" echo " # phase (must match with parallel-opts option)" - echo " --parallel-opts # Option should match number of threads in" + echo " --parallel-opts # Option should match number of threads in" echo " # --num-threads option above" echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" echo " # initialization (this relatively high" diff --git a/egs/lre/v1/lid/train_ivector_extractor.sh b/egs/lre/v1/lid/train_ivector_extractor.sh index 8e238985f99..18f536a60cb 100755 --- a/egs/lre/v1/lid/train_ivector_extractor.sh +++ b/egs/lre/v1/lid/train_ivector_extractor.sh @@ -13,7 +13,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -24,8 +24,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -84,7 +84,7 @@ nj_full=$[$nj*$num_processes] sdata=$data/split$nj_full; utils/split_data.sh $data $nj_full || exit 1; -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. feats="ark,s,cs:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:$sdata/JOB/feats.scp ark:- | add-deltas-sdc ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -97,7 +97,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -146,7 +146,7 @@ while [ $x -lt $num_iters ]; do nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. 
- $cmd -pe smp $nt $dir/log/update.$x.log \ + $cmd --num-threads $nt $dir/log/update.$x.log \ ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; rm $dir/acc.$x.* if $cleanup; then diff --git a/egs/lre/v1/run.sh b/egs/lre/v1/run.sh index 740fad7aceb..bc0f8db572d 100755 --- a/egs/lre/v1/run.sh +++ b/egs/lre/v1/run.sh @@ -50,9 +50,9 @@ rm foo local/split_long_utts.sh --max-utt-len 120 data/train_unsplit data/train # This commented script is an alternative to the above utterance -# splitting method. Here we split the utterance based on the number of +# splitting method. Here we split the utterance based on the number of # frames which are voiced, rather than the total number of frames. -# max_voiced=3000 +# max_voiced=3000 # local/vad_split_utts.sh --max-voiced $max_voiced data/train_unsplit $mfccdir data/train use_vtln=true @@ -61,7 +61,7 @@ if $use_vtln; then cp -rt data/${t} data/${t}_novtln rm -r data/${t}_novtln/{split,.backup,spk2warp} 2>/dev/null || true steps/make_mfcc.sh --mfcc-config conf/mfcc_vtln.conf --nj 100 --cmd "$train_cmd" \ - data/${t}_novtln exp/make_mfcc $mfccdir + data/${t}_novtln exp/make_mfcc $mfccdir lid/compute_vad_decision.sh data/${t}_novtln exp/make_mfcc $mfccdir done # Vtln-related things: @@ -115,7 +115,7 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ # Alternatively, a diagonal UBM can replace the full UBM used above. # The preceding calls to train_diag_ubm.sh and train_full_ubm.sh # can be commented out and replaced with the following lines. -# +# # This results in a slight degradation but could improve error rate when # there is less training data than used in this example. # @@ -125,12 +125,12 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd" data/train \ #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \ # exp/full_ubm_2048/final.ubm -lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \ +lid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \ --num-iters 5 exp/full_ubm_2048/final.ubm data/train \ exp/extractor_2048 -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/train exp/ivectors_train -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/lre07 exp/ivectors_lre07 diff --git a/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh b/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh index 972348766b5..aeced4fb273 100755 --- a/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh +++ b/egs/lre07/v1/lid/init_full_ubm_from_dnn.sh @@ -12,6 +12,7 @@ nj=40 cmd="run.pl" stage=-2 +cleanup=true # End configuration section. @@ -77,4 +78,11 @@ $cmd $dir/log/init.log \ "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \ $dir/final.ubm || exit 1; +if $cleanup; then + echo "$0: removing stats" + for g in $(seq $nj); do + rm $dir/stats.$g.acc || exit 1 + done +fi + exit 0; diff --git a/egs/lre07/v1/lid/nnet2/get_egs2.sh b/egs/lre07/v1/lid/nnet2/get_egs2.sh index 27cf82bd1a1..7806dce4894 100755 --- a/egs/lre07/v1/lid/nnet2/get_egs2.sh +++ b/egs/lre07/v1/lid/nnet2/get_egs2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2015 David Snyder # Apache 2.0. 
# @@ -54,7 +54,7 @@ transform_dir= # If supplied, overrides alidir as the place to find fMLLR tr postdir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. random_copy=false online_ivector_dir= # can be used if we are including speaker information as iVectors. @@ -83,7 +83,7 @@ if [ $# != 3 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -109,7 +109,7 @@ utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info cp $alidir/tree $dir -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; @@ -129,7 +129,7 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir -## Set up features. +## Set up features. if [ -z $feat_type ]; then if [ -f $alidir/final.mat ] && [ ! -f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi fi @@ -140,7 +140,7 @@ case $feat_type in valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` # caution: the top-level nnet training script should copy these to its own dir now. cp $alidir/{splice_opts,final.mat} $dir || exit 1; @@ -280,13 +280,13 @@ if [ $stage -le 3 ]; then egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark" done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. + # The examples will go round-robin to egs_list. if [ ! -z $postdir ]; then $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ scp:$postdir/post.JOB.scp ark:- \| \ nnet-copy-egs ark:- $egs_list || exit 1; - else + else $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ @@ -299,7 +299,7 @@ if [ $stage -le 4 ]; then # shuffle the order, writing to the egs.JOB.ark egs_list= - for n in $(seq $nj); do + for n in $(seq $nj); do egs_list="$egs_list $dir/egs_orig.JOB.$n.ark" done diff --git a/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh b/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh index 4809f42e633..533001934ab 100755 --- a/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh +++ b/egs/lre07/v1/lid/nnet2/train_multisplice_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -9,7 +9,7 @@ # This is a modified version of train_multisplice_accel2.sh in # steps/nnet2/ for language recognition. The main difference is -# that it uses different get_lda.sh and get_egs2.sh scripts. +# that it uses different get_lda.sh and get_egs2.sh scripts. 
# # The original train_multisplice_accel2.sh was a modified version of # train_pnorm_multisplice2.sh (still using pnorm). The "accel" refers to the @@ -25,11 +25,11 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh @@ -66,7 +66,7 @@ splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3" # so hidden layer indexing is different from component count -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. alpha=4.0 # relates to preconditioning. update_period=4 # relates to online preconditioning: says how often we update the subspace. @@ -78,11 +78,11 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +combine_parallel_opts="--num-threads 8" # queue options for the "combine" stage. cleanup=true egs_dir= lda_opts= @@ -92,7 +92,7 @@ transform_dir= # If supplied, overrides alidir feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -127,10 +127,10 @@ if [ $# != 4 ]; then echo " --num-threads # Number of parallel threads per job (will affect results" echo " # as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." - echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" - echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" - echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce --mem" + echo " # versus your defaults, because it gets multiplied by the --num-threads argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." 
echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -148,7 +148,7 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -372,7 +372,7 @@ while [ $x -lt $num_iters ]; do ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -417,7 +417,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -461,7 +461,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -500,7 +500,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; @@ -537,7 +537,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/lre07/v1/lid/train_diag_ubm.sh b/egs/lre07/v1/lid/train_diag_ubm.sh index 60f2452f3b7..a5e256818ce 100755 --- a/egs/lre07/v1/lid/train_diag_ubm.sh +++ b/egs/lre07/v1/lid/train_diag_ubm.sh @@ -29,7 +29,7 @@ cleanup=true min_gaussian_weight=0.0001 remove_low_count_gaussians=true # set this to false if you need #gauss to stay fixed. num_threads=32 -parallel_opts="-pe smp 32" +parallel_opts="--num-threads 32" # End configuration section. echo "$0 $@" # Print the command line for logging @@ -49,7 +49,7 @@ if [ $# != 3 ]; then echo " --stage # stage to do partial re-run from." 
echo " --num-gselect # Number of Gaussians per frame to" echo " # limit computation to, for speed" - echo " --subsample # In main E-M phase, use every n" + echo " --subsample # In main E-M phase, use every n" echo " # frames (a speedup)" echo " --num-frames # Maximum num-frames to keep in memory" echo " # for model initialization" @@ -59,7 +59,7 @@ if [ $# != 3 ]; then echo " # in initialization phase (then split)" echo " --num-threads # number of threads to use in initialization" echo " # phase (must match with parallel-opts option)" - echo " --parallel-opts # Option should match number of threads in" + echo " --parallel-opts # Option should match number of threads in" echo " # --num-threads option above" echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" echo " # initialization (this relatively high" @@ -129,10 +129,11 @@ for x in `seq 0 $[$num_iters-1]`; do $cmd $dir/log/update.$x.log \ gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ $dir/$[$x+1].dubm || exit 1; - rm $dir/$x.*.acc $dir/$x.dubm + $cleanup && rm $dir/$x.*.acc $dir/$x.dubm fi done -rm $dir/gselect.*.gz +$cleanup && rm $dir/gselect.*.gz + mv $dir/$num_iters.dubm $dir/final.dubm || exit 1; exit 0; diff --git a/egs/lre07/v1/lid/train_ivector_extractor.sh b/egs/lre07/v1/lid/train_ivector_extractor.sh index 8e238985f99..55bd54bb275 100755 --- a/egs/lre07/v1/lid/train_ivector_extractor.sh +++ b/egs/lre07/v1/lid/train_ivector_extractor.sh @@ -13,7 +13,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -24,8 +24,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -84,7 +84,7 @@ nj_full=$[$nj*$num_processes] sdata=$data/split$nj_full; utils/split_data.sh $data $nj_full || exit 1; -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. 
feats="ark,s,cs:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:$sdata/JOB/feats.scp ark:- | add-deltas-sdc ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -97,7 +97,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -135,27 +135,25 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; echo "Updating model (pass $x)" nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + $cmd --num-threads $nt $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done +$cleanup && rm $dir/post.*.gz rm $dir/final.ie 2>/dev/null ln -s $x.ie $dir/final.ie diff --git a/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh b/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh index 7464ce5faea..573258e7b88 100755 --- a/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh +++ b/egs/lre07/v1/lid/train_ivector_extractor_dnn.sh @@ -9,16 +9,16 @@ # This script trains the i-vector extractor using a DNN-based UBM. It also requires # an fGMM, created by the script lid/init_full_gmm_from_dnn.sh. -# Note: there are 3 separate levels of parallelization: num_threads, num_processes, -# and num_jobs. This may seem a bit excessive. It has to do with minimizing -# memory usage and disk I/O, subject to various constraints. The "num_threads" +# Note: there are 3 separate levels of parallelization: num_threads, num_processes, +# and num_jobs. This may seem a bit excessive. It has to do with minimizing +# memory usage and disk I/O, subject to various constraints. The "num_threads" # is how many threads a program uses; the "num_processes" is the number of separate # processes a single job spawns, and then sums the accumulators in memory. # Our recommendation: # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -29,8 +29,8 @@ # may want more jobs, though. # Begin configuration section. 
-nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -95,9 +95,9 @@ utils/split_data.sh $data $nj_full || exit 1; sdata_dnn=$data_dnn/split$nj_full; utils/split_data.sh $data_dnn $nj_full || exit 1; - -parallel_opts="-pe smp $[$num_threads*$num_processes]" + +parallel_opts="--num-threads $[$num_threads*$num_processes]" # Set up features. @@ -114,7 +114,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1; -fi +fi # Do Gaussian selection and posterior extracion @@ -153,24 +153,21 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; - echo "Updating model (pass $x)" - nt=$[$num_threads*$num_processes] # use the same number of threads that - # each accumulation process uses, since we - # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + echo "Updating model (pass $x)" + nt=$[$num_threads*$num_processes] # use the same number of threads that + # each accumulation process uses, since we + # can be sure the queue will support this many. + $cmd --num-threads $nt $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done diff --git a/egs/lre07/v1/run.sh b/egs/lre07/v1/run.sh index a4ff4d909ba..8664494e558 100755 --- a/egs/lre07/v1/run.sh +++ b/egs/lre07/v1/run.sh @@ -127,12 +127,12 @@ utils/subset_data_dir.sh data/train 5000 data/train_5k utils/subset_data_dir.sh data/train 10000 data/train_10k -lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ +lid/train_diag_ubm.sh --nj 30 --cmd "$train_cmd --mem 20G" \ data/train_5k 2048 exp/diag_ubm_2048 -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=20G,ram_free=20G" \ +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd --mem 20G" \ data/train_10k exp/diag_ubm_2048 exp/full_ubm_2048_10k -lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd --mem 35G" \ data/train exp/full_ubm_2048_10k exp/full_ubm_2048 # Alternatively, a diagonal UBM can replace the full UBM used above. 
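# Note the staging above: the diagonal UBM is trained on the 5k-utterance
# subset for speed, the full-covariance UBM is then initialized from it on the
# 10k subset, and finally re-estimated on the whole training set.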
@@ -148,7 +148,7 @@ lid/train_full_ubm.sh --nj 30 --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ #gmm-global-to-fgmm exp/diag_ubm_2048/final.dubm \ # exp/full_ubm_2048/final.ubm -lid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +lid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \ --use-weights true \ --num-iters 5 exp/full_ubm_2048/final.ubm data/train \ exp/extractor_2048 @@ -162,10 +162,10 @@ utils/fix_data_dir.sh data/train_lr echo "**Language count for logistic regression training (after splitting long utterances):**" awk '{print $2}' data/train_lr/utt2lang | sort | uniq -c | sort -nr -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/train_lr exp/ivectors_train -lid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +lid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048 data/lre07 exp/ivectors_lre07 lid/run_logistic_regression.sh --prior-scale 0.70 \ diff --git a/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh b/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh index a223e12333f..51fcf401cb2 100755 --- a/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh +++ b/egs/lre07/v2/local/dnn/run_nnet2_multisplice.sh @@ -19,13 +19,13 @@ set -e # assume use_gpu=true since it would be way too slow otherwise. if ! cuda-compiled; then - cat <" + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2007S10" echo "See comments in the script for more details" exit 1 fi @@ -45,7 +46,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -55,7 +56,7 @@ awk -v sph2pipe=$sph2pipe '{ # sw02001-A_000098-001156 sw02001-A 0.98 11.56 #pem=$sdir/english/hub5e_00.pem #[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ @@ -67,7 +68,7 @@ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ | sort -u > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -85,7 +86,7 @@ cat $tdir/*.stm | \ grep -v inter_segment_gap | \ awk '{ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm + > $dir/stm #$tdir/reference/hub5e00.english.000405.stm > $dir/stm cp $rtroot/data/trans_rules/en20030506.glm $dir/glm @@ -95,10 +96,10 @@ cp $rtroot/data/trans_rules/en20030506.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. 
-awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -118,4 +119,3 @@ done echo Data preparation and formatting completed for RT-03 echo "(but not MFCC extraction)" - diff --git a/egs/rm/s5/local/chain/run_tdnn_5g.sh b/egs/rm/s5/local/chain/run_tdnn_5g.sh index f6fbe070763..088cb3ec778 100755 --- a/egs/rm/s5/local/chain/run_tdnn_5g.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5g.sh @@ -120,7 +120,7 @@ if [ $stage -le 8 ]; then --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ --trainer.optimization.final-effective-lrate $final_effective_lrate \ --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir data/train \ --tree-dir $treedir \ --lat-dir exp/tri3b_lats \ diff --git a/egs/rm/s5/local/chain/run_tdnn_5n.sh b/egs/rm/s5/local/chain/run_tdnn_5n.sh index 7fd7b82aa1d..7a08becd57f 100755 --- a/egs/rm/s5/local/chain/run_tdnn_5n.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5n.sh @@ -25,7 +25,8 @@ num_jobs_final=4 minibatch_size=128 frames_per_eg=150 remove_egs=false - +#common_egs_dir=exp/chain/tdnn_5g/egs/ +common_egs_dir= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -121,7 +122,7 @@ if [ $stage -le 8 ]; then --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ --trainer.optimization.final-effective-lrate $final_effective_lrate \ --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs true \ + --cleanup.remove-egs $remove_egs \ --feat-dir data/train \ --tree-dir $treedir \ --lat-dir exp/tri3b_lats \ diff --git a/egs/rm/s5/local/nnet2/run_4b_gpu.sh b/egs/rm/s5/local/nnet2/run_4b_gpu.sh index 34a5cd34f7e..9cde9f1694e 100755 --- a/egs/rm/s5/local/nnet2/run_4b_gpu.sh +++ b/egs/rm/s5/local/nnet2/run_4b_gpu.sh @@ -16,7 +16,7 @@ If you want to use GPUs (and have them), go to src/, and configure and make on a where "nvcc" is installed. EOF -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. . utils/parse_options.sh # to parse the --stage option, if given diff --git a/egs/rm/s5/local/nnet2/run_4c.sh b/egs/rm/s5/local/nnet2/run_4c.sh index 2b580fe29d6..b3060c46ca0 100755 --- a/egs/rm/s5/local/nnet2/run_4c.sh +++ b/egs/rm/s5/local/nnet2/run_4c.sh @@ -14,20 +14,20 @@ use_gpu=true if $use_gpu; then if ! cuda-compiled; then - cat < foo; @@ -78,20 +78,20 @@ sid/train_full_ubm.sh --nj 30 --remove-low-count-gaussians false --num-iters 1 - data/train_female_4k exp/full_ubm_2048 exp/full_ubm_2048_female & wait -# note, the mem_free,ram_free is counted per thread... in this setup each +# note, the --mem is counted per thread... in this setup each # job has 4 processes running each with 4 threads; each job takes about 5G # of memory so we need about 20G, plus add memory for sum-accs to make it 25G. -# but we'll submit using -pe smp 16, and this multiplies the memory requirement +# but we'll submit using --num-threads 16, and this multiplies the memory requirement # by 16, so submitting with 2G as the requirement, to make the total requirement # 32, is reasonable. # Train the iVector extractor for male speakers. 
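A quick bash check, illustrative only, of the memory arithmetic in the note above (the per-process and per-slot figures are the ones assumed there, not measured by the recipe):

num_processes=4; num_threads=4       # per-job settings in this setup
mem_per_proc_gb=5                    # ~5G per accumulation process
sum_accs_gb=5                        # head-room for the sum-accs step
mem_per_slot_gb=2                    # memory requested per queue slot
slots=$[num_processes*num_threads]                   # 16, passed via --num-threads 16
needed=$[num_processes*mem_per_proc_gb+sum_accs_gb]  # 4*5 + 5 = 25G per job
granted=$[slots*mem_per_slot_gb]                     # 16 * 2G = 32G, above the 25G needed
echo "need ~${needed}G per job; requesting 2G over $slots slots grants ${granted}G"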
-sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \ --num-iters 5 exp/full_ubm_2048_male/final.ubm data/train_male \ exp/extractor_2048_male # The same for female speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=2G,ram_free=2G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 2G" \ --num-iters 5 exp/full_ubm_2048_female/final.ubm data/train_female \ exp/extractor_2048_female @@ -105,22 +105,22 @@ sid/gender_id.sh --cmd "$train_cmd" --nj 150 exp/full_ubm_2048{,_male,_female} \ # Gender-id error rate is 2.58% # Extract the iVectors for the Fisher data. -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_male data/train_male exp/ivectors_train_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_female data/train_female exp/ivectors_train_female # .. and for the SRE08 training and test data. (We focus on the main # evaluation condition, the only required one in that eval, which is # the short2-short3 eval.) -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_female data/sre08_train_short2_female exp/ivectors_sre08_train_short2_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_male data/sre08_train_short2_male exp/ivectors_sre08_train_short2_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_female data/sre08_test_short3_female exp/ivectors_sre08_test_short3_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=3G,ram_free=3G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 3G" --nj 50 \ exp/extractor_2048_male data/sre08_test_short3_male exp/ivectors_sre08_test_short3_male @@ -131,7 +131,7 @@ cat $trials | awk '{print $1, $2}' | \ ivector-compute-dot-products - \ scp:exp/ivectors_sre08_train_short2_female/spk_ivector.scp \ scp:exp/ivectors_sre08_test_short3_female/spk_ivector.scp \ - foo + foo local/score_sre08.sh $trials foo diff --git a/egs/sre08/v1/run.sh b/egs/sre08/v1/run.sh index 4e31542bf4d..c4afe447e8d 100755 --- a/egs/sre08/v1/run.sh +++ b/egs/sre08/v1/run.sh @@ -110,12 +110,12 @@ sid/train_full_ubm.sh --nj 30 --remove-low-count-gaussians false \ wait # Train the iVector extractor for male speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \ --num-iters 5 exp/full_ubm_2048_male/final.ubm data/train_male \ exp/extractor_2048_male # The same for female speakers. -sid/train_ivector_extractor.sh --cmd "$train_cmd -l mem_free=35G,ram_free=35G" \ +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 35G" \ --num-iters 5 exp/full_ubm_2048_female/final.ubm data/train_female \ exp/extractor_2048_female @@ -129,25 +129,25 @@ sid/gender_id.sh --cmd "$train_cmd" --nj 150 exp/full_ubm_2048{,_male,_female} \ # Gender-id error rate is 3.41% # Extract the iVectors for the training data. 
-sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_male data/train_male exp/ivectors_train_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_female data/train_female exp/ivectors_train_female # .. and for the SRE08 training and test data. (We focus on the main # evaluation condition, the only required one in that eval, which is # the short2-short3 eval.) -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_female data/sre08_train_short2_female \ exp/ivectors_sre08_train_short2_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_male data/sre08_train_short2_male \ exp/ivectors_sre08_train_short2_male -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_female data/sre08_test_short3_female \ exp/ivectors_sre08_test_short3_female -sid/extract_ivectors.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" --nj 50 \ +sid/extract_ivectors.sh --cmd "$train_cmd --mem 6G" --nj 50 \ exp/extractor_2048_male data/sre08_test_short3_male \ exp/ivectors_sre08_test_short3_male diff --git a/egs/sre08/v1/sid/extract_ivectors_dnn.sh b/egs/sre08/v1/sid/extract_ivectors_dnn.sh index 8692e6ee8a5..2687d1fc6c8 100755 --- a/egs/sre08/v1/sid/extract_ivectors_dnn.sh +++ b/egs/sre08/v1/sid/extract_ivectors_dnn.sh @@ -1,7 +1,7 @@ #!/bin/bash # Copyright 2013 Daniel Povey -# 2014-2015 David Snyder +# 2014-2017 David Snyder # 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) # 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. @@ -16,6 +16,9 @@ stage=0 min_post=0.025 # Minimum posterior to use (posteriors below this are pruned out) posterior_scale=1.0 # This scale helps to control for successive features being highly # correlated. E.g. try 0.1 or 0.3. +use_gpu=true +chunk_size=256 +nnet_job_opt= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -37,6 +40,8 @@ if [ $# != 5 ]; then echo " --num-gselect # Number of Gaussians to select using" echo " # diagonal model." echo " --min-post # Pruning threshold for posteriors" + echo " --nnet-job-opt # Options for the DNN jobs which add to or" + echo " # replace those specified by --cmd" exit 1; fi @@ -46,6 +51,21 @@ data=$3 data_dnn=$4 dir=$5 +gpu_opt="" +if $use_gpu; then + nnet_job_opt="$nnet_job_opt --gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are trying to use the GPU but you have not compiled" + echo " for CUDA. If you have GPUs and have nvcc installed, go to src/" + echo " and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be slow." + gpu_opt="--use-gpu=no" +fi + for f in $srcdir/final.ie $srcdir/final.ubm $data/feats.scp ; do [ ! -f $f ] && echo "No such file $f" && exit 1; done @@ -60,8 +80,6 @@ utils/split_data.sh $data_dnn $nj || exit 1; delta_opts=`cat $srcdir/delta_opts 2>/dev/null` -splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options - ## Set up features. 
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -69,13 +87,18 @@ nnet_feats="ark,s,cs:apply-cmvn-sliding --center=true scp:$sdata_dnn/JOB/feats.s if [ $stage -le 0 ]; then echo "$0: extracting iVectors" - $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ - nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ - \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ - \| logprob-to-post --min-post=$min_post ark:- ark:- \| \ - scale-post ark:- $posterior_scale ark:- \| \ - ivector-extract --verbose=2 $srcdir/final.ie "$feats" ark,s,cs:- \ - ark,scp,t:$dir/ivector.JOB.ark,$dir/ivector.JOB.scp || exit 1; + for g in $(seq $nj); do + $cmd $nnet_job_opt $dir/log/extract_ivectors.$g.log \ + nnet-am-compute $gpu_opt --apply-log=true --chunk-size=${chunk_size} \ + $nnet "`echo $nnet_feats | sed s/JOB/$g/g`" ark:- \ + \| select-voiced-frames ark:- scp,s,cs:$sdata/$g/vad.scp ark:- \ + \| logprob-to-post --min-post=$min_post ark:- ark:- \| \ + scale-post ark:- $posterior_scale ark:- \| \ + ivector-extract --verbose=2 $srcdir/final.ie \ + "`echo $feats | sed s/JOB/$g/g`" ark,s,cs:- \ + ark,scp,t:$dir/ivector.$g.ark,$dir/ivector.$g.scp || exit 1 & + done + wait fi if [ $stage -le 1 ]; then diff --git a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh index f6710028ae5..c6b508a7206 100755 --- a/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh +++ b/egs/sre08/v1/sid/init_full_ubm_from_dnn.sh @@ -1,18 +1,23 @@ #!/bin/bash -# Copyright 2015 David Snyder -# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) -# 2015 Johns Hopkins University (Author: Daniel Povey) +# Copyright 2015-2017 David Snyder +# 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0 # This script derives a full-covariance UBM from DNN posteriors and # speaker recognition features. # Begin configuration section. -nj=40 +nj=8 cmd="run.pl" stage=-2 delta_window=3 delta_order=2 +use_gpu=true +nnet_job_opt= +cleanup=true +chunk_size=256 +stage=0 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -30,15 +35,34 @@ if [ $# != 4 ]; then echo " --nj # number of parallel training jobs" echo " --delta-window # delta window size" echo " --delta-order # delta order" - echo " # to be equal to the size of the DNN output layer." + echo " --use-gpu # Use GPU to extract DNN posteriors" + echo " --chunk-size # Number of frames processed at a time by the DNN" + echo " --nnet-job-opt # Options for the DNN jobs which add to or" + echo " # replace those specified by --cmd" exit 1; fi -data=$1 -data_dnn=$2 +data=$1 # Features for the GMM +data_dnn=$2 # Features for the DNN nnet=$3 dir=$4 +gpu_opt="" +nnet_job_opt="" +if $use_gpu; then + nnet_job_opt="$nnet_job_opt --gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are trying to use the GPU but you have not compiled" + echo " for CUDA. If you have GPUs and have nvcc installed, go to src/" + echo " and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be slow." + gpu_opt="--use-gpu=no" +fi + for f in $data/feats.scp $data/vad.scp ${data_dnn}/feats.scp \ ${data_dnn}/vad.scp $nnet; do @@ -69,16 +93,34 @@ select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" # in the ancillary GMM. 
num_components=`grep -oP 'output-dim\ \K[0-9]+' <(nnet-am-info $nnet 2> /dev/null)` -$cmd JOB=1:$nj $logdir/make_stats.JOB.log \ - nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ - \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ - \| logprob-to-post ark:- ark:- \| \ - fgmm-global-acc-stats-post ark:- $num_components "$feats" \ - $dir/stats.JOB.acc || exit 1; +if [ $stage -le 0 ]; then + echo "$0: accumulating stats from DNN posteriors and speaker ID features" + for g in $(seq $nj); do + $cmd $nnet_job_opt $dir/log/make_stats.$g.log \ + nnet-am-compute $gpu_opt \ + --chunk-size=${chunk_size} --apply-log=true $nnet \ + "`echo $nnet_feats | sed s/JOB/$g/g`" \ + ark:- \ + \| select-voiced-frames ark:- scp,s,cs:$sdata/$g/vad.scp ark:- \ + \| logprob-to-post ark:- ark:- \| \ + fgmm-global-acc-stats-post ark:- $num_components \ + "`echo $feats | sed s/JOB/$g/g`" \ + $dir/stats.$g.acc || exit 1 & + done + wait +fi -$cmd $dir/log/init.log \ - fgmm-global-init-from-accs --verbose=2 \ - "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \ - $dir/final.ubm || exit 1; +if [ $stage -le 1 ]; then + echo "$0: initializing GMM from stats" + $cmd $dir/log/init.log \ + fgmm-global-init-from-accs --verbose=2 \ + "fgmm-global-sum-accs - $dir/stats.*.acc |" $num_components \ + $dir/final.ubm || exit 1; +fi -exit 0; +if $cleanup; then + echo "$0: removing stats" + for g in $(seq $nj); do + rm $dir/stats.$g.acc || exit 1 + done +fi diff --git a/egs/sre10/v1/local/dnn/get_egs2.sh b/egs/sre08/v1/sid/nnet2/get_egs2.sh similarity index 98% rename from egs/sre10/v1/local/dnn/get_egs2.sh rename to egs/sre08/v1/sid/nnet2/get_egs2.sh index 9f1644178e2..05ea1d1a0cd 100755 --- a/egs/sre10/v1/local/dnn/get_egs2.sh +++ b/egs/sre08/v1/sid/nnet2/get_egs2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2015 David Snyder # Apache 2.0. # @@ -54,7 +54,7 @@ transform_dir= # If supplied, overrides alidir as the place to find fMLLR tr postdir= # If supplied, we will use posteriors in it as soft training targets. stage=0 -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. random_copy=false online_ivector_dir= # can be used if we are including speaker information as iVectors. @@ -83,7 +83,7 @@ if [ $# != 3 ]; then echo " # very end." echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -109,7 +109,7 @@ utils/split_data.sh $data $nj mkdir -p $dir/log $dir/info cp $alidir/tree $dir -# Get list of validation utterances. +# Get list of validation utterances. awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ > $dir/valid_uttlist || exit 1; @@ -129,7 +129,7 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir -## Set up features. +## Set up features. if [ -z $feat_type ]; then if [ -f $alidir/final.mat ] && [ ! 
-f $transform_dir/raw_trans.1 ]; then feat_type=lda; else feat_type=raw; fi fi @@ -140,7 +140,7 @@ case $feat_type in valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` # caution: the top-level nnet training script should copy these to its own dir now. cp $alidir/{splice_opts,final.mat} $dir || exit 1; @@ -280,13 +280,13 @@ if [ $stage -le 3 ]; then egs_list="$egs_list ark:$dir/egs_orig.$n.JOB.ark" done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. + # The examples will go round-robin to egs_list. if [ ! -z $postdir ]; then $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ scp:$postdir/post.JOB.scp ark:- \| \ nnet-copy-egs ark:- $egs_list || exit 1; - else + else $cmd $io_opts JOB=1:$nj $dir/log/get_egs.JOB.log \ nnet-get-egs $ivectors_opt $nnet_context_opts --num-frames=$frames_per_eg "$feats" \ "ark,s,cs:gunzip -c $alidir/ali.JOB.gz | ali-to-pdf $alidir/final.mdl ark:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ @@ -299,7 +299,7 @@ if [ $stage -le 4 ]; then # shuffle the order, writing to the egs.JOB.ark egs_list= - for n in $(seq $nj); do + for n in $(seq $nj); do egs_list="$egs_list $dir/egs_orig.JOB.$n.ark" done diff --git a/egs/sre10/v1/local/dnn/get_lda.sh b/egs/sre08/v1/sid/nnet2/get_lda.sh similarity index 99% rename from egs/sre10/v1/local/dnn/get_lda.sh rename to egs/sre08/v1/sid/nnet2/get_lda.sh index 253222ff271..89594a20f84 100755 --- a/egs/sre10/v1/local/dnn/get_lda.sh +++ b/egs/sre08/v1/sid/nnet2/get_lda.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). # 2015 David Snyder # Apache 2.0. # @@ -108,7 +108,7 @@ N=$[$num_feats/$nj] case $feat_type in raw) feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- |" ;; - lda) + lda) splice_opts=`cat $alidir/splice_opts 2>/dev/null` cp $alidir/{splice_opts,final.mat} $dir || exit 1; feats="ark,s,cs:utils/subset_scp.pl --quiet $N $sdata/JOB/feats.scp | apply-cmvn-sliding --center=true scp:- ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |" @@ -144,7 +144,7 @@ fi echo $ivector_dim >$dir/ivector_dim if [ -z "$lda_dim" ]; then - spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)" + spliced_feats_one="$(echo "$spliced_feats" | sed s:JOB:1:g)" lda_dim=$(feat-to-dim "$spliced_feats_one" -) || exit 1; fi diff --git a/egs/sre10/v1/local/dnn/train_multisplice_accel2.sh b/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh similarity index 96% rename from egs/sre10/v1/local/dnn/train_multisplice_accel2.sh rename to egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh index f5441d6e967..c56e89b5d94 100755 --- a/egs/sre10/v1/local/dnn/train_multisplice_accel2.sh +++ b/egs/sre08/v1/sid/nnet2/train_multisplice_accel2.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey). # 2013 Xiaohui Zhang # 2013 Guoguo Chen # 2014 Vimal Manohar @@ -8,7 +8,7 @@ # Apache 2.0. 
# This is a modified version of train_multisplice_accel2.sh in -# steps/nnet2/ for speaker recognition. The main difference is +# ../../steps/nnet2/ for speaker recognition. The main difference is # that it uses different get_lda.sh and get_egs2.sh scripts. # # The original train_multisplice_accel2.sh was a modified version of @@ -25,11 +25,11 @@ num_epochs=15 # Number of epochs of training; initial_effective_lrate=0.01 final_effective_lrate=0.001 bias_stddev=0.5 -pnorm_input_dim=3000 +pnorm_input_dim=3000 pnorm_output_dim=300 minibatch_size=128 # by default use a smallish minibatch size for neural net # training; this controls instability which would otherwise - # be a problem with multi-threaded update. + # be a problem with multi-threaded update. samples_per_iter=400000 # each iteration of training, see this many samples # per job. This option is passed to get_egs.sh @@ -66,7 +66,7 @@ splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3" # so hidden layer indexing is different from component count -io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. These don't +io_opts="--max-jobs-run 5" # for jobs with a lot of I/O, limits the number running at one time. These don't randprune=4.0 # speeds up LDA. alpha=4.0 # relates to preconditioning. update_period=4 # relates to online preconditioning: says how often we update the subspace. @@ -78,11 +78,11 @@ precondition_rank_out=80 # relates to online preconditioning mix_up=0 # Number of components to mix up to (should be > #tree leaves, if # specified.) num_threads=16 -parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" +parallel_opts="--num-threads 16 --mem 1G" # by default we use 16 threads; this lets the queue know. # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. combine_num_threads=8 -combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +combine_parallel_opts="--num-threads 8" # queue options for the "combine" stage. cleanup=true egs_dir= lda_opts= @@ -92,7 +92,7 @@ transform_dir= # If supplied, overrides alidir feat_type= # Can be used to force "raw" features. align_cmd= # The cmd that is passed to steps/nnet2/align.sh align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] -realign_times= # List of times on which we realign. Each time is +realign_times= # List of times on which we realign. Each time is # floating point number strictly between 0 and 1, which # will be multiplied by the num-iters to get an iteration # number. @@ -127,10 +127,10 @@ if [ $# != 4 ]; then echo " --num-threads # Number of parallel threads per job (will affect results" echo " # as well as speed; may interact with batch size; if you increase" echo " # this, you may want to decrease the batch size." - echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" - echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" - echo " # versus your defaults, because it gets multiplied by the -pe smp argument." - echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce --mem" + echo " # versus your defaults, because it gets multiplied by the --num-threads argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." 
echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" echo " # should not get too large, e.g. >2k)." echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" @@ -148,7 +148,7 @@ if [ $# != 4 ]; then echo " --stage # Used to run a partially-completed training process from somewhere in" echo " # the middle." - + exit 1; fi @@ -201,7 +201,7 @@ extra_opts+=(--transform-dir $transform_dir) if [ $stage -le -4 ]; then echo "$0: calling get_lda.sh" - local/dnn/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1; + sid/nnet2/get_lda.sh $lda_opts "${extra_opts[@]}" --left-context $first_left_context --right-context $first_right_context --cmd "$cmd" $data $lang $alidir $dir || exit 1; fi # these files will have been written by get_lda.sh feat_dim=$(cat $dir/feat_dim) || exit 1; @@ -213,7 +213,7 @@ if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then extra_opts+=(--left-context $nnet_left_context ) extra_opts+=(--right-context $nnet_right_context ) echo "$0: calling get_egs2.sh" - local/dnn/get_egs2.sh $egs_opts "${extra_opts[@]}" \ + sid/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" \ --samples-per-iter $samples_per_iter --stage $get_egs_stage \ --io-opts "$io_opts" \ --cmd "$cmd" $egs_opts \ @@ -372,7 +372,7 @@ while [ $x -lt $num_iters ]; do ilr=$initial_effective_lrate; flr=$final_effective_lrate; np=$num_archives_processed; nt=$num_archives_to_process; this_learning_rate=$(perl -e "print (($x + 1 >= $num_iters ? $flr : $ilr*exp($np*log($flr/$ilr)/$nt))*$this_num_jobs);"); - echo "On iteration $x, learning rate is $this_learning_rate." + echo "On iteration $x, learning rate is $this_learning_rate." if [ ! -z "${realign_this_iter[$x]}" ]; then prev_egs_dir=$cur_egs_dir @@ -417,7 +417,7 @@ while [ $x -lt $num_iters ]; do steps/nnet2/remove_egs.sh $prev_egs_dir fi fi - + # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics $cmd $dir/log/compute_prob_valid.$x.log \ @@ -461,7 +461,7 @@ while [ $x -lt $num_iters ]; do ( # this sub-shell is so that when we "wait" below, # we only wait for the training jobs that we just spawned, # not the diagnostic jobs that we spawned above. - + # We can't easily use a single parallel SGE job to do the main training, # because the computation of which archive and which --frame option # to use for each job is a little complex, so we spawn each one separately. @@ -500,7 +500,7 @@ while [ $x -lt $num_iters ]; do n=$(perl -e '($nj,$pat)=@ARGV; $best_n=1; $best_logprob=-1.0e+10; for ($n=1;$n<=$nj;$n++) { $fn = sprintf($pat,$n); open(F, "<$fn") || die "Error opening log file $fn"; undef $logprob; while () { if (m/log-prob-per-frame=(\S+)/) { $logprob=$1; } } - close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; + close(F); if (defined $logprob && $logprob > $best_logprob) { $best_logprob=$logprob; $best_n=$n; } } print "$best_n\n"; ' $num_jobs_nnet $dir/log/train.$x.%d.log) || exit 1; [ -z "$n" ] && echo "Error getting best model" && exit 1; cp $dir/$[$x+1].$n.mdl $dir/$[$x+1].mdl || exit 1; @@ -537,7 +537,7 @@ if [ $stage -le $num_iters ]; then cur_offset=0 # current offset from first_model_combine. 
for n in $(seq $max_models_combine); do next_offset=$[($n*$num_models_combine)/$max_models_combine] - sub_list="" + sub_list="" for o in $(seq $cur_offset $[$next_offset-1]); do iter=$[$first_model_combine+$o] mdl=$dir/$iter.mdl diff --git a/egs/sre08/v1/sid/train_diag_ubm.sh b/egs/sre08/v1/sid/train_diag_ubm.sh index 6ff1a9099d9..1e79fc10c99 100755 --- a/egs/sre08/v1/sid/train_diag_ubm.sh +++ b/egs/sre08/v1/sid/train_diag_ubm.sh @@ -60,7 +60,7 @@ if [ $# != 3 ]; then echo " # in initialization phase (then split)" echo " --num-threads # number of threads to use in initialization" echo " # phase (must match with parallel-opts option)" - echo " --parallel-opts # Option should match number of threads in" + echo " --parallel-opts # Option should match number of threads in" echo " # --num-threads option above" echo " --min-gaussian-weight # min Gaussian weight allowed in GMM" echo " # initialization (this relatively high" @@ -85,7 +85,7 @@ for f in $data/feats.scp $data/vad.scp; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1 done -parallel_opts="-pe smp $num_threads" +parallel_opts="--num-threads $num_threads" delta_opts="--delta-window=$delta_window --delta-order=$delta_order" echo $delta_opts > $dir/delta_opts @@ -135,10 +135,11 @@ for x in `seq 0 $[$num_iters-1]`; do $cmd $dir/log/update.$x.log \ gmm-global-est $opt --min-gaussian-weight=$min_gaussian_weight $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc|" \ $dir/$[$x+1].dubm || exit 1; - rm $dir/$x.*.acc $dir/$x.dubm + $cleanup && rm $dir/$x.*.acc $dir/$x.dubm fi done -rm $dir/gselect.*.gz +$cleanup && rm $dir/gselect.*.gz + mv $dir/$num_iters.dubm $dir/final.dubm || exit 1; exit 0; diff --git a/egs/sre08/v1/sid/train_ivector_extractor.sh b/egs/sre08/v1/sid/train_ivector_extractor.sh index 5d7eb984485..68ba0ca65fd 100755 --- a/egs/sre08/v1/sid/train_ivector_extractor.sh +++ b/egs/sre08/v1/sid/train_ivector_extractor.sh @@ -13,7 +13,7 @@ # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -24,8 +24,8 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we +nj=10 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we # run is nj * num_processes * num_threads, and the number of # separate pieces of data is nj * num_processes. num_threads=4 @@ -90,7 +90,7 @@ if [ -f $srcdir/delta_opts ]; then cp $srcdir/delta_opts $dir/ 2>/dev/null fi -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. 
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -102,7 +102,7 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1 -fi +fi # Do Gaussian selection and posterior extracion @@ -140,26 +140,24 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; echo "Updating model (pass $x)" nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done - +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie ln -s $x.ie $dir/final.ie diff --git a/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh b/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh index 64579735376..2ce915a0750 100755 --- a/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh +++ b/egs/sre08/v1/sid/train_ivector_extractor_dnn.sh @@ -1,23 +1,23 @@ #!/bin/bash # Copyright 2013 Daniel Povey -# 2014-2015 David Snyder +# 2014-2017 David Snyder # 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) # 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. # This script trains the i-vector extractor using a DNN-based UBM. It also requires # an fGMM, usually created by the script sid/init_full_gmm_from_dnn.sh. -# Note: there are 3 separate levels of parallelization: num_threads, num_processes, -# and num_jobs. This may seem a bit excessive. It has to do with minimizing -# memory usage and disk I/O, subject to various constraints. The "num_threads" +# Note: there are 3 separate levels of parallelization: num_threads, num_processes, +# and num_jobs. This may seem a bit excessive. It has to do with minimizing +# memory usage and disk I/O, subject to various constraints. The "num_threads" # is how many threads a program uses; the "num_processes" is the number of separate # processes a single job spawns, and then sums the accumulators in memory. # Our recommendation: # - Set num_threads to the minimum of (4, or how many virtual cores your machine has). # (because of needing to lock various global quantities, the program can't # use many more than 4 threads with good CPU utilization). -# - Set num_processes to the number of virtual cores on each machine you have, divided by +# - Set num_processes to the number of virtual cores on each machine you have, divided by # num_threads. E.g. 4, if you have 16 virtual cores. 
If you're on a shared queue # that's busy with other people's jobs, it may be wise to set it to rather less # than this maximum though, or your jobs won't get scheduled. And if memory is @@ -28,12 +28,12 @@ # may want more jobs, though. # Begin configuration section. -nj=10 # this is the number of separate queue jobs we run, but each one - # contains num_processes sub-jobs.. the real number of threads we - # run is nj * num_processes * num_threads, and the number of - # separate pieces of data is nj * num_processes. +nj=5 # this is the number of separate queue jobs we run, but each one + # contains num_processes sub-jobs.. the real number of threads we + # run is nj * num_processes * num_threads, and the number of + # separate pieces of data is nj * num_processes. num_threads=4 -num_processes=4 # each job runs this many processes, each with --num-threads threads +num_processes=2 # each job runs this many processes, each with --num-threads threads cmd="run.pl" stage=-4 num_gselect=20 # Gaussian-selection using diagonal model: number of Gaussians to select @@ -46,6 +46,9 @@ cleanup=true posterior_scale=1.0 # This scale helps to control for successve features being highly # correlated. E.g. try 0.1 or 0.3 sum_accs_opt= +use_gpu=true +chunk_size=256 +nnet_job_opt= # End configuration section. echo "$0 $@" # Print the command line for logging @@ -71,6 +74,9 @@ if [ $# != 5 ]; then echo " # diagonal model." echo " --sum-accs-opt # Option e.g. '-l hostname=a15' to localize" echo " # sum-accs process to nfs server." + echo " --nnet-job-opt # Options for the DNN jobs which add to or" + echo " # replace those specified by --cmd" + echo " --chunk-size # Number of frames processed at a time by the DNN" exit 1; fi @@ -80,6 +86,21 @@ data=$3 data_dnn=$4 dir=$5 +gpu_opt="" +if $use_gpu; then + nnet_job_opt="$nnet_job_opt --gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are trying to use the GPU but you have not compiled" + echo " for CUDA. If you have GPUs and have nvcc installed, go to src/" + echo " and do ./configure; make" + exit 1 + fi +else + echo "$0: without using a GPU this will be slow." + gpu_opt="--use-gpu=no" +fi + srcdir=$(dirname $fgmm_model) for f in $fgmm_model $data/feats.scp ; do @@ -100,9 +121,7 @@ if [ -f $srcdir/delta_opts ]; then cp $srcdir/delta_opts $dir/ 2>/dev/null fi -splice_opts=`cat exp/nnet//splice_opts 2>/dev/null` # frame-splicing options - -parallel_opts="-pe smp $[$num_threads*$num_processes]" +parallel_opts="--num-threads $[$num_threads*$num_processes]" ## Set up features. 
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |" @@ -117,19 +136,24 @@ if [ $stage -le -2 ]; then $cmd $dir/log/init.log \ ivector-extractor-init --ivector-dim=$ivector_dim --use-weights=$use_weights \ $dir/final.ubm $dir/0.ie || exit 1; -fi +fi # Do Gaussian selection and posterior extraction if [ $stage -le -1 ]; then echo $nj_full > $dir/num_jobs echo "$0: doing DNN posterior computation" - $cmd JOB=1:$nj_full $dir/log/post.JOB.log \ - nnet-am-compute --apply-log=true $nnet "$nnet_feats" ark:- \ - \| select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- \ - \| logprob-to-post --min-post=$min_post ark,s,cs:- ark:- \| \ - scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.JOB.gz" || exit 1; - + for g in $(seq $nj_full); do + $cmd $nnet_job_opt $dir/log/post.$g.log \ + nnet-am-compute $gpu_opt \ + --chunk-size=${chunk_size} --apply-log=true $nnet \ + "`echo $nnet_feats | sed s/JOB/$g/g`" \ + ark:- \ + \| select-voiced-frames ark:- scp,s,cs:$sdata/$g/vad.scp ark:- \ + \| logprob-to-post ark:- ark:- \ + \| scale-post ark:- $posterior_scale "ark:|gzip -c >$dir/post.$g.gz" || exit 1 & + done + wait else if ! [ $nj_full -eq $(cat $dir/num_jobs) ]; then echo "Num-jobs mismatch $nj_full versus $(cat $dir/num_jobs)" @@ -156,26 +180,25 @@ while [ $x -lt $num_iters ]; do done wait [ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1; - accs="" - for j in $(seq $nj); do - accs+="$dir/acc.$x.$j " - done - echo "Summing accs (pass $x)" - $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ - ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; + accs="" + for j in $(seq $nj); do + accs+="$dir/acc.$x.$j " + done + echo "Summing accs (pass $x)" + $cmd $sum_accs_opt $dir/log/sum_acc.$x.log \ + ivector-extractor-sum-accs $accs $dir/acc.$x || exit 1; echo "Updating model (pass $x)" nt=$[$num_threads*$num_processes] # use the same number of threads that # each accumulation process uses, since we # can be sure the queue will support this many. - $cmd -pe smp $nt $dir/log/update.$x.log \ - ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; - rm $dir/acc.$x.* - if $cleanup; then - rm $dir/acc.$x - # rm $dir/$x.ie - fi + $cmd $parallel_opts $dir/log/update.$x.log \ + ivector-extractor-est --num-threads=$nt $dir/$x.ie $dir/acc.$x $dir/$[$x+1].ie || exit 1; + rm $dir/acc.$x.* + $cleanup && rm $dir/acc.$x $dir/$x.ie fi x=$[$x+1] done +$cleanup && rm -f $dir/post.*.gz +rm -f $dir/final.ie ln -s $x.ie $dir/final.ie diff --git a/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh b/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh index 684cc8ddfc0..97b9789af0c 100755 --- a/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh +++ b/egs/sre10/v1/local/dnn/run_nnet2_multisplice.sh @@ -4,56 +4,52 @@ # egs/fisher_english/s5/local/online. It has been modified # for speaker recognition. -. cmd.sh - - stage=1 train_stage=-10 use_gpu=true set -e -. cmd.sh +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. utils/parse_options.sh # assume use_gpu=true since it would be way too slow otherwise. if ! cuda-compiled; then - cat < local/scores_gmm_2048_dep_pooled/plda_scores +# Pool the gender dependent results. 
+mkdir -p exp/scores_gmm_2048_dep_pooled +cat exp/scores_gmm_2048_dep_male/plda_scores exp/scores_gmm_2048_dep_female/plda_scores \ + > exp/scores_gmm_2048_dep_pooled/plda_scores # GMM-2048 PLDA EER # ind pooled: 2.26 @@ -140,7 +141,7 @@ cat local/scores_gmm_2048_dep_male/plda_scores local/scores_gmm_2048_dep_female/ echo "GMM-$num_components EER" for x in ind dep; do for y in female male pooled; do - eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_gmm_${num_components}_${x}_${y}/plda_scores) 2> /dev/null` + eer=`compute-eer <(python local/prepare_for_eer.py $trials exp/scores_gmm_${num_components}_${x}_${y}/plda_scores) 2> /dev/null` echo "${x} ${y}: $eer" done done diff --git a/egs/sre10/v2/cmd.sh b/egs/sre10/v2/cmd.sh index 5c38b3a5d77..fe4cd0bcb3f 100755 --- a/egs/sre10/v2/cmd.sh +++ b/egs/sre10/v2/cmd.sh @@ -6,10 +6,10 @@ # the number of cpus on your machine. #a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +#export cuda_cmd="queue --gpu 1" +export mkgraph_cmd="queue.pl --mem 4G" #b) BUT cluster options #export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" diff --git a/egs/sre10/v2/run.sh b/egs/sre10/v2/run.sh index 4f5ab2756bb..b6c24fc1371 100755 --- a/egs/sre10/v2/run.sh +++ b/egs/sre10/v2/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2015-2016 David Snyder +# Copyright 2015-2017 David Snyder # 2015 Johns Hopkins University (Author: Daniel Garcia-Romero) # 2015 Johns Hopkins University (Author: Daniel Povey) # Apache 2.0. @@ -105,62 +105,61 @@ utils/fix_data_dir.sh data/train_32k # Initialize a full GMM from the DNN posteriors and speaker recognition # features. This can be used both alone, as a UBM, or to initialize the # i-vector extractor in a DNN-based system. -sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd -l mem_free=6G,ram_free=6G" \ +sid/init_full_ubm_from_dnn.sh --cmd "$train_cmd --mem 15G" \ data/train_32k \ data/train_dnn_32k $nnet exp/full_ubm # Train an i-vector extractor based on just the supervised-GMM. sid/train_ivector_extractor.sh \ - --cmd "$train_cmd -l mem_free=70G,ram_free=70G" \ + --cmd "$train_cmd --mem 120G" \ --ivector-dim 600 \ --num-iters 5 exp/full_ubm/final.ubm data/train \ exp/extractor_sup_gmm # Train an i-vector extractor based on the DNN-UBM. sid/train_ivector_extractor_dnn.sh \ - --cmd "$train_cmd -l mem_free=80G,ram_free=80G" \ - --min-post 0.015 \ - --ivector-dim 600 \ - --num-iters 5 exp/full_ubm/final.ubm $nnet \ + --cmd "$train_cmd --mem 100G" --nnet-job-opt "--mem 4G" \ + --min-post 0.015 --ivector-dim 600 --num-iters 5 \ + exp/full_ubm/final.ubm $nnet \ data/train \ data/train_dnn \ exp/extractor_dnn # Extract i-vectors from the extractor with the sup-GMM UBM. 
sid/extract_ivectors.sh \ - --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ + --cmd "$train_cmd --mem 12G" --nj 40 \ exp/extractor_sup_gmm data/sre10_train \ exp/ivectors_sre10_train_sup_gmm sid/extract_ivectors.sh \ - --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ + --cmd "$train_cmd --mem 12G" --nj 40 \ exp/extractor_sup_gmm data/sre10_test \ exp/ivectors_sre10_test_sup_gmm sid/extract_ivectors.sh \ - --cmd "$train_cmd -l mem_free=8G,ram_free=8G" --nj 40 \ + --cmd "$train_cmd --mem 12G" --nj 40 \ exp/extractor_sup_gmm data/sre \ exp/ivectors_sre_sup_gmm # Extract i-vectors using the extractor with the DNN-UBM. sid/extract_ivectors_dnn.sh \ - --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ + --cmd "$train_cmd --mem 15G" --nj 10 \ exp/extractor_dnn \ $nnet \ data/sre10_test \ data/sre10_test_dnn \ exp/ivectors10_test_dnn -sid/extract_ivectors_dnn.sh - --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh \ + --cmd "$train_cmd --mem 15G" --nj 10 \ exp/extractor_dnn \ $nnet \ data/sre10_train \ data/sre10_train_dnn \ exp/ivectors10_train_dnn -sid/extract_ivectors_dnn.sh - --cmd "$train_cmd -l mem_free=10G,ram_free=10G" --nj 40 \ +sid/extract_ivectors_dnn.sh \ + --cmd "$train_cmd --mem 15G" --nj 10 \ exp/extractor_dnn \ $nnet \ data/sre \ @@ -183,87 +182,90 @@ local/scoring_common.sh data/sre data/sre10_train data/sre10_test \ # # local/cosine_scoring.sh data/sre10_train data/sre10_test \ # exp/ivectors_sre10_train exp/ivectors_sre10_test $trials \ -# local/scores_gmm_2048_ind_pooled +# exp/scores_gmm_2048_ind_pooled # local/lda_scoring.sh data/sre data/sre10_train data/sre10_test \ # exp/ivectors_sre exp/ivectors_sre10_train exp/ivectors_sre10_test \ -# $trials local/scores_gmm_2048_ind_pooled +# $trials exp/scores_gmm_2048_ind_pooled # Create a gender independent PLDA model and do scoring with the sup-GMM system. local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm \ - exp/ivectors_sre10_test_sup_gmm $trials local/scores_sup_gmm_ind_pooled + exp/ivectors_sre10_test_sup_gmm $trials exp/scores_sup_gmm_ind_pooled local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \ - exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_ind_female + exp/ivectors_sre10_test_sup_gmm_female $trials_female exp/scores_sup_gmm_ind_female local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \ - exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_ind_male + exp/ivectors_sre10_test_sup_gmm_male $trials_male exp/scores_sup_gmm_ind_male # Create gender dependent PLDA models and do scoring with the sup-GMM system. 
local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_female \ - exp/ivectors_sre10_test_sup_gmm_female $trials_female local/scores_sup_gmm_dep_female + exp/ivectors_sre10_test_sup_gmm_female $trials_female exp/scores_sup_gmm_dep_female local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_sup_gmm exp/ivectors_sre10_train_sup_gmm_male \ - exp/ivectors_sre10_test_sup_gmm_male $trials_male local/scores_sup_gmm_dep_male -mkdir -p local/scores_sup_gmm_dep_pooled -cat local/scores_sup_gmm_dep_male/plda_scores local/scores_sup_gmm_dep_female/plda_scores \ - > local/scores_sup_gmm_dep_pooled/plda_scores + exp/ivectors_sre10_test_sup_gmm_male $trials_male exp/scores_sup_gmm_dep_male + +# Pool the gender dependent results +mkdir -p exp/scores_sup_gmm_dep_pooled +cat exp/scores_sup_gmm_dep_male/plda_scores exp/scores_sup_gmm_dep_female/plda_scores \ + > exp/scores_sup_gmm_dep_pooled/plda_scores # Create a gender independent PLDA model and do scoring with the DNN system. local/plda_scoring.sh data/sre data/sre10_train data/sre10_test \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn \ - exp/ivectors_sre10_test_dnn $trials local/scores_dnn_ind_pooled + exp/ivectors_sre10_test_dnn $trials exp/scores_dnn_ind_pooled local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \ - exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_ind_female + exp/ivectors_sre10_test_dnn_female $trials_female exp/scores_dnn_ind_female local/plda_scoring.sh --use-existing-models true data/sre data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \ - exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_ind_male + exp/ivectors_sre10_test_dnn_male $trials_male exp/scores_dnn_ind_male # Create gender dependent PLDA models and do scoring with the DNN system. 
local/plda_scoring.sh data/sre_female data/sre10_train_female data/sre10_test_female \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_female \ - exp/ivectors_sre10_test_dnn_female $trials_female local/scores_dnn_dep_female + exp/ivectors_sre10_test_dnn_female $trials_female exp/scores_dnn_dep_female local/plda_scoring.sh data/sre_male data/sre10_train_male data/sre10_test_male \ exp/ivectors_sre_dnn exp/ivectors_sre10_train_dnn_male \ - exp/ivectors_sre10_test_dnn_male $trials_male local/scores_dnn_dep_male -mkdir -p local/scores_dnn_dep_pooled -cat local/scores_dnn_dep_male/plda_scores local/scores_dnn_dep_female/plda_scores \ - > local/scores_dnn_dep_pooled/plda_scores + exp/ivectors_sre10_test_dnn_male $trials_male exp/scores_dnn_dep_male + +mkdir -p exp/scores_dnn_dep_pooled +cat exp/scores_dnn_dep_male/plda_scores exp/scores_dnn_dep_female/plda_scores \ + > exp/scores_dnn_dep_pooled/plda_scores # Sup-GMM PLDA EER # ind pooled: 1.72 # ind female: 1.81 -# ind male: 1.56 -# dep female: 1.89 -# dep male: 1.39 -# dep pooled: 1.65 -echo "Sup-GMM-$num_components EER" +# ind male: 1.70 +# dep female: 2.03 +# dep male: 1.50 +# dep pooled: 1.79 +echo "Sup-GMM EER" for x in ind dep; do for y in female male pooled; do - eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_sup_gmm_${x}_${y}/plda_scores) 2> /dev/null` + eer=`compute-eer <(python local/prepare_for_eer.py $trials exp/scores_sup_gmm_${x}_${y}/plda_scores) 2> /dev/null` echo "${x} ${y}: $eer" done done -# DNN PLDA EER -# ind pooled: 1.05 -# ind female: 1.33 -# ind male: 0.75 -# dep female: 1.41 -# dep male: 0.64 -# dep pooled: 1.02 -echo "DNN-$num_components EER" +# DNN-UBM EER +# ind pooled: 1.01 +# ind female: 1.16 +# ind male: 0.78 +# dep female: 1.27 +# dep male: 0.61 +# dep pooled: 0.96 +echo "DNN-UBM EER" for x in ind dep; do for y in female male pooled; do - eer=`compute-eer <(python local/prepare_for_eer.py $trials local/scores_dnn_${x}_${y}/plda_scores) 2> /dev/null` + eer=`compute-eer <(python local/prepare_for_eer.py $trials exp/scores_dnn_${x}_${y}/plda_scores) 2> /dev/null` echo "${x} ${y}: $eer" done done # In comparison, here is the EER for an unsupervised GMM-based system -# with 5297 components (the same as the number of senones in the DNN): +# with 5297 components (about the same as the number of senones in the DNN): # GMM-5297 PLDA EER # ind pooled: 2.25 # ind female: 2.33 diff --git a/egs/swahili/s5/cmd.sh b/egs/swahili/s5/cmd.sh index ab1c23f76ef..8c9422b92bc 100755 --- a/egs/swahili/s5/cmd.sh +++ b/egs/swahili/s5/cmd.sh @@ -1,5 +1,5 @@ # JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export cuda_cmd="queue --gpu 1" +export mkgraph_cmd="queue.pl --mem 4G" diff --git a/egs/swbd/README.txt b/egs/swbd/README.txt index fc61a4c3060..1da570274e4 100644 --- a/egs/swbd/README.txt +++ b/egs/swbd/README.txt @@ -10,11 +10,14 @@ About the Switchboard corpus We are using the eval2000 a.k.a. hub5'00 evaluation data. The acoustics are LDC2002S09 and the text is LDC2002T43. + We are also using the RT'03 test set, available as LDC2007S10. Note: not + all parts of the recipe test with this. + About the Fisher corpus for language modeling We use Fisher English training speech transcripts for language modeling, if they are available. 
The catalog number for part 1 transcripts is LDC2004T19, - and LDC2005T19 for part 2. + and LDC2005T19 for part 2. Each subdirectory of this directory contains the scripts for a sequence of experiments. @@ -24,4 +27,3 @@ scripts for a sequence of experiments. s5b: This is (somewhat less) out of date, please see s5c s5c: This is the current recipe. - diff --git a/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh index 940c99538cb..3aae7918964 100755 --- a/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5a_gpu.sh @@ -18,7 +18,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/nnet5a_gpu/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh index 50f79208897..74058d9fac4 100755 --- a/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5b_gpu.sh @@ -66,7 +66,7 @@ if [ $stage -le 2 ]; then steps/nnet2/train_block.sh --stage "$train_stage" \ --num-threads 1 --max-change 40.0 --minibatch-size 512 \ - --parallel-opts "-l gpu=1" \ + --parallel-opts "--gpu 1" \ --initial-learning-rate 0.01 --final-learning-rate 0.001 \ --num-epochs 10 --num-epochs-extra 5 \ --cmd "$decode_cmd" \ diff --git a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh index 36f72b77083..55becfbe0fc 100755 --- a/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5c_gpu.sh @@ -20,7 +20,7 @@ EOF ( if [ ! -f exp/nnet5c_gpu/final.mdl ]; then - steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "--max-jobs-run 5" \ + steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "--gpu 1" --io-opts "--max-jobs-run 5" \ --num-threads 1 --minibatch-size 512 --max-change 40.0 --mix-up 20000 --samples-per-iter 300000 \ --num-epochs 10 --num-epochs-extra 3 --initial-learning-rate 0.0067 --final-learning-rate 0.00067 \ --num-jobs-nnet 10 --num-hidden-layers 5 --hidden-layer-dim 1536 data/train_nodup data/lang \ diff --git a/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh index 5364f14bcb6..e0b523910df 100755 --- a/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5d_gpu.sh @@ -18,7 +18,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/$dir/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh index 545c80c0e1c..77de59b90ff 100755 --- a/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5e_gpu.sh @@ -18,7 +18,7 @@ train_stage=-10 . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/$dir/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh b/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh index 3cc315a9775..b91599a27e6 100755 --- a/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_5f_gpu.sh @@ -9,7 +9,7 @@ dir=nnet5f_gpu . ./cmd.sh . ./path.sh . 
utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. ( if [ ! -f exp/$dir/final.mdl ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh b/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh index 712c8e79c5b..6327ee85224 100755 --- a/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_6a_gpu.sh @@ -21,7 +21,7 @@ EOF . utils/parse_options.sh -parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely have to change it. +parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll likely have to change it. alidir=exp/nnet5a_ali_100k_nodup if [ ! -f $alidir/.done ]; then diff --git a/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh b/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh index 8324051279b..0296f4cca00 100755 --- a/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh +++ b/egs/swbd/s5b/local/nnet2/run_6c_gpu.sh @@ -7,7 +7,7 @@ # directory name. -gpu_opts="-l gpu=1" # This is suitable for the CLSP network, +gpu_opts="--gpu 1" # This is suitable for the CLSP network, # you'll likely have to change it. we'll # use it later on, in the training (it's # not used in denlat creation) @@ -18,8 +18,8 @@ set -e # exit on error. . ./cmd.sh . ./path.sh -! cuda-compiled && cat </dev/null || true data_dirs= - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ $x $train_data_dir exp/shift_hires/ mfcc_hires utils/fix_data_dir.sh ${train_data_dir}_fs$x @@ -110,7 +109,7 @@ if [ $frame_subsampling_factor -ne 1 ]; then awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done utils/combine_data.sh ${train_data_dir}_fs $data_dirs - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do rm -r ${train_data_dir}_fs$x done fi @@ -119,9 +118,9 @@ if [ $frame_subsampling_factor -ne 1 ]; then affix=_fs fi - + rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true -for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done online_ivector_dir=${online_ivector_dir}_fs @@ -140,7 +139,7 @@ fi if [ -z "$lats_dir" ]; then lats_dir=${srcdir}_denlats${affix} if [ $stage -le 2 ]; then - nj=50 + nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of # the phases of get_egs_discriminative.sh below. 
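# (A rough sketch of what the frame-shifting loops above do, assuming the usual
#  frame_subsampling_factor=3 of 'chain' models: $[frame_subsampling_factor/2]
#  evaluates to 1, so each loop expands to
#      for x in $(seq -1 1); do ... done
#  i.e. the shifts -1, 0 and 1.  Each pass writes a copy of the features shifted
#  by x frames to ${train_data_dir}_fs$x, and the three copies are then merged
#  with utils/combine_data.sh before the temporary per-shift copies are removed.)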
num_threads_denlats=6 @@ -154,16 +153,13 @@ if [ -z "$lats_dir" ]; then fi fi -model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` -model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - -cmvn_opts=`cat $srcdir/cmvn_opts` +cmvn_opts=`cat $srcdir/cmvn_opts` if [ -z "$degs_dir" ]; then degs_dir=${srcdir}_degs${affix} @@ -176,16 +172,13 @@ if [ -z "$degs_dir" ]; then # have a higher maximum num-jobs if if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi - degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" - steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ --adjust-priors false --acwt 1.0 \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ - --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; fi fi @@ -198,7 +191,7 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" --use-frame-shift false \ - --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --adjust-priors false \ --modify-learning-rates false \ ${degs_dir} $dir ; fi @@ -210,7 +203,7 @@ if [ $stage -le 5 ]; then ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` iter=epoch$x.adj - + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} $context_opts \ @@ -235,4 +228,3 @@ fi exit 0; - diff --git a/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh new file mode 100755 index 00000000000..1e673f8e01a --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_blstm_6k.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# 6k is same as 6j, but with the fast lstm layers + +# local/chain/compare_wer_general.sh blstm_6j_sp blstm_6k_sp +# System blstm_6j_sp blstm_6k_sp +# WER on train_dev(tg) 13.80 13.25 +# WER on train_dev(fg) 12.64 12.27 +# WER on eval2000(tg) 15.6 15.7 +# WER on eval2000(fg) 14.2 14.5 +# Final train prob -0.055 -0.052 +# Final valid prob -0.077 -0.080 +# Final train prob (xent) -0.777 -0.743 +# Final valid prob (xent) -0.9126 -0.8816 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/blstm_6k # Note: _sp will get added to this 
if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh new file mode 100755 index 00000000000..b9b7152dcbe --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_lstm_6k.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +# Copyright 2015 Johns Hopkins University (Author: Daniel Povey). +# 2015 Vijayaditya Peddinti +# 2015 Xingyu Na +# 2015 Pegah Ghahrmani +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + + + +# run_lstm_6k.sh is like run_lstm_6j.sh but making +# various kaldi-5.1-related upgrades to the script. +# For the list of changes compare tuning/run_tdnn_lstm_1{c,d}.sh + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/lstm_6k # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + # Note : The delay variable will be used just in the init.config. 
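  # (Reading the descriptor on the next line, roughly: Append(-2,-1,0,1,2,...)
  #  splices the 40-dim "input" features at offsets -2..+2 around the current
  #  frame t, and ReplaceIndex(ivector, t, 0) requests the 100-dim "ivector"
  #  input with its time index pinned to 0, so the same i-vector value is
  #  appended to every spliced frame of an example rather than varying with t.)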
+ fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat delay=$label_delay + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
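  # (For context: the --self-loop-scale 1.0 passed to mkgraph.sh below, together
  #  with --acwt 1.0 and --post-decode-acwt 10.0 in the decode commands, is the
  #  standard 'chain' configuration; the post-decode scaling by 10 just puts the
  #  lattice scores back on the conventional scale so the usual LM-weight range
  #  applies at scoring time.)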
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
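      # (The ${decode_iter:+_$decode_iter} used in the decode-directory names is
      #  plain bash parameter expansion: it becomes "_$decode_iter" when
      #  decode_iter is set and non-empty, and nothing otherwise.  For example,
      #  with the hypothetical setting decode_iter=final the directory would be
      #  decode_eval2000_final_sw1_tg; with decode_iter left empty it is
      #  decode_eval2000_sw1_tg.)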
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh index 85afa7bf9ca..25c6841c0a9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_6h_discriminative.sh @@ -78,14 +78,13 @@ dir=${srcdir}_${criterion} ## Egs options frames_per_eg=150 frames_overlap_per_eg=30 -truncate_deriv_weights=10 ## Nnet training options effective_learning_rate=0.000000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005" # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 ## Decode options @@ -93,8 +92,8 @@ decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we deci if $use_gpu; then if ! cuda-compiled; then - cat </dev/null || true data_dirs= - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do steps/shift_feats.sh --cmd "$train_cmd --max-jobs-run 40" --nj 350 \ $x $train_data_dir exp/shift_hires/ mfcc_hires utils/fix_data_dir.sh ${train_data_dir}_fs$x @@ -137,7 +136,7 @@ if [ $frame_subsampling_factor -ne 1 ]; then awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done utils/combine_data.sh ${train_data_dir}_fs $data_dirs - for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do + for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do rm -r ${train_data_dir}_fs$x done fi @@ -146,9 +145,9 @@ if [ $frame_subsampling_factor -ne 1 ]; then affix=_fs fi - + rm ${online_ivector_dir}_fs/ivector_online.scp 2>/dev/null || true -for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do +for x in `seq -$[frame_subsampling_factor/2] $[frame_subsampling_factor/2]`; do awk -v nfs=$x '{print "fs"nfs"-"$0}' $online_ivector_dir/ivector_online.scp >> ${online_ivector_dir}_fs/ivector_online.scp done online_ivector_dir=${online_ivector_dir}_fs @@ -167,7 +166,7 @@ fi if [ -z "$lats_dir" ]; then lats_dir=${srcdir}_denlats${affix} if [ $stage -le 2 ]; then - nj=50 + nj=50 # this doesn't really affect anything strongly, except the num-jobs for one of # the phases of get_egs_discriminative.sh below. 
num_threads_denlats=6 @@ -181,16 +180,13 @@ if [ -z "$lats_dir" ]; then fi fi -model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` -model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` +model_left_context=`nnet3-am-info $srcdir/final.mdl | grep "left-context:" | awk '{print $2}'` +model_right_context=`nnet3-am-info $srcdir/final.mdl | grep "right-context:" | awk '{print $2}'` left_context=$[model_left_context + extra_left_context] right_context=$[model_right_context + extra_right_context] -valid_left_context=$[valid_left_context + frames_per_eg] -valid_right_context=$[valid_right_context + frames_per_eg] - -cmvn_opts=`cat $srcdir/cmvn_opts` +cmvn_opts=`cat $srcdir/cmvn_opts` if [ -z "$degs_dir" ]; then degs_dir=${srcdir}_degs${affix} @@ -203,16 +199,13 @@ if [ -z "$degs_dir" ]; then # have a higher maximum num-jobs if if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi - degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" - steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ --adjust-priors false --acwt 1.0 \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ - --valid-left-context $valid_left_context --valid-right-context $valid_right_context \ - --priors-left-context $valid_left_context --priors-right-context $valid_right_context $frame_subsampling_opt \ - --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + $frame_subsampling_opt \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ $train_data_dir $lang ${srcdir}_ali${affix} $lats_dir $srcdir/final.mdl $degs_dir ; fi fi @@ -225,7 +218,7 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" --use-frame-shift false \ - --truncate-deriv-weights $truncate_deriv_weights --adjust-priors false \ + --adjust-priors false \ --modify-learning-rates false \ ${degs_dir} $dir ; fi @@ -237,7 +230,7 @@ if [ $stage -le 5 ]; then ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` iter=epoch$x.adj - + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --acwt 1.0 --post-decode-acwt 10.0 \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ @@ -262,4 +255,3 @@ fi exit 0; - diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh index 59bc2c64f70..9dfaa1d4509 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7h.sh @@ -9,7 +9,14 @@ #Final valid prob -0.110475 -0.113102 #Final train prob (xent) -1.20065 -1.2533 #Final valid prob (xent) -1.3313 -1.36743 -# + +# Online decoding +# System tdnn_7h_sp tdnn_7h_sp_online +# WER on train_dev(tg) 13.96 13.95 +# WER on train_dev(fg) 12.86 12.82 +# WER on eval2000(tg) 16.5 16.5 +# WER on eval2000(fg) 14.8 14.8 + set -e # configs for 'chain' @@ -20,6 +27,7 @@ get_egs_stage=-10 speed_perturb=true dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. 
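# (So with the default speed_perturb=true, the model and its decode directories
#  end up under exp/chain/tdnn_7h_sp; the _sp training data is the usual 3-way
#  speed-perturbed set, typically built with factors 0.9, 1.0 and 1.1.)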
decode_iter= +decode_nj=50 # training options num_epochs=4 @@ -36,6 +44,8 @@ remove_egs=false common_egs_dir= xent_regularize=0.1 +test_online_decoding=false # if true, it will run the last decoding stage. + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -193,26 +203,65 @@ if [ $stage -le 14 ]; then utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg fi -decode_suff=sw1_tg + graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi if [ $stage -le 15 ]; then - iter_opts= - if [ ! -z $decode_iter ]; then - iter_opts=" --iter $decode_iter " - fi + rm $dir/.error 2>/dev/null || true for decode_set in train_dev eval2000; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 50 --cmd "$decode_cmd" $iter_opts \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ - $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; if $has_fisher; then steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; fi - ) & + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
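      # (prepare_online_decoding.sh above packages the final model together with
      #  the hires MFCC config and the online i-vector extractor into
      #  ${dir}_online, so this decode recomputes features and i-vectors on the
      #  fly from the wav data; that is why its WERs are expected to track the
      #  offline results closely, as in the comparison table at the top of this
      #  script.)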
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi fi -wait; + + exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh index 9aec95393d1..793b40f7fe3 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7j.sh @@ -126,12 +126,12 @@ if [ $stage -le 12 ]; then # the first splicing is moved before the lda layer, so no splicing here relu-renorm-layer name=tdnn1 dim=768 - tdnn-relu-renorm-layer name=tdnn2 splice-indexes=-1,0,1 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn3 splice-indexes=-1,0,1 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn4 splice-indexes=-3,0,3 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn5 splice-indexes=-3,0,3 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn6 splice-indexes=-3,0,3 dim=768 subset-dim=384 - tdnn-relu-renorm-layer name=tdnn7 splice-indexes=-3,0,3 dim=768 subset-dim=384 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=768 subset-dim=384 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=768 subset-dim=384 ## adding the layers for chain branch relu-renorm-layer name=prefinal-chain input=tdnn7 dim=768 target-rms=0.5 diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh new file mode 100755 index 00000000000..12b63b7e96a --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_blstm_1a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# tdnn_blstm_1a is same as blstm_6k, but with the initial tdnn layers +# blstm_6k : num-parameters: 41155430 +# tdnn_blstm_1a : num-parameters: 53688166 + +# local/chain/compare_wer_general.sh blstm_6l_sp blstm_6k_sp +# System blstm_6k_sp tdnn_blstm_1a_sp +# WER on train_dev(tg) 13.25 12.95 +# WER on train_dev(fg) 12.27 11.98 +# WER on eval2000(tg) 15.7 15.5 +# WER on eval2000(fg) 14.5 14.1 +# Final train prob -0.052 -0.041 +# Final valid prob -0.080 -0.072 +# Final train prob (xent) -0.743 -0.629 +# Final valid prob (xent) -0.8816 -0.8091 + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_blstm_1a # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=40 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=0 + +# decode options +extra_left_context=50 +extra_right_context=50 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=blstm1-forward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm1-backward input=tdnn3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
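  # (Worked through for this script: xent_regularize=0.025, so the
  #  learning-rate factor computed above is 0.5 / 0.025 = 20; because the xent
  #  objective itself is weighted by 0.025, the xent output layer effectively
  #  trains at 0.025 * 20 = 0.5 times the base learning rate, independent of the
  #  chosen value of xent_regularize.)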
+ output-layer name=output-xent input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + iter_opts= + if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh old mode 100644 new mode 100755 index b305c57b6ab..d71301eb102 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -5,15 +5,19 @@ # it's faster. See PR #1243 on github, and issue #1237. # This used to be called run_tdnn_fastlstm_1b.sh. -#System tdnn_lstm_1a_ld5 tdnn_lstm_1b_ld5 tdnn_lstm_1c_ld5 -#WER on train_dev(tg) 13.42 13.00 12.91 -#WER on train_dev(fg) 12.42 12.03 11.98 -#WER on eval2000(tg) 15.7 15.3 15.2 -#WER on eval2000(fg) 14.2 13.9 13.8 -#Final train prob -0.0538088 -0.056294 -0.050 -#Final valid prob -0.0800484 -0.0813322 -0.092 -#Final train prob (xent) -0.7603 -0.777787 -0.756 -#Final valid prob (xent) -0.949909 -0.939146 -0.983 +## note: the last column below was this run on Feb 1 2017, in the +## shortcut branch. Results are a bit worse, but I believe this is just +## random noise or a little bit of mean-regression. + +#System tdnn_lstm_1a_ld5_sp tdnn_lstm_1b_ld5_sp tdnn_lstm_1c_ld5_sp tdnn_lstm_1c_ld5_sp +#WER on train_dev(tg) 13.42 13.00 12.91 13.17 +#WER on train_dev(fg) 12.42 12.03 11.98 12.25 +#WER on eval2000(tg) 15.7 15.3 15.2 15.4 +#WER on eval2000(fg) 14.2 13.9 13.8 14.1 +#Final train prob -0.0538088 -0.056294 -0.050 -0.046 +#Final valid prob -0.0800484 -0.0813322 -0.092 -0.073 +#Final train prob (xent) -0.7603 -0.777787 -0.756 -0.749 +#Final valid prob (xent) -0.949909 -0.939146 -0.983 -0.980 set -e diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh new file mode 100755 index 00000000000..22c7d2e582d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -0,0 +1,288 @@ +#!/bin/bash + + +# run_tdnn_lstm_1d.sh is like run_tdnn_lstm_1c.sh but making +# various kaldi-5.1-related upgrades to the script: +# change chunk-width to be variable, add extra_left_context_initial=0 +# and extra_right_context_final=0; add looped decoding. +# Also changed frames-per-iter from 1.2 million to 1.5 million... this +# might have been a mistake, trying 1 million in 1f to see if this matters. + +# The comparison below is with a version of the 1c system that was run at about +# the same time. The degradation in log-likelihood and xent prob is likely because +# now on average the chunk-size is slightly smaller than before (150 -> 136); +# possibly the change in extra-(left,right) context has a similar effect +# (or maybe it's just because the validation and train-subset examples have changed). 
+ + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1c_ld5_sp tdnn_lstm_1d_sp +# System tdnn_lstm_1c_ld5_sp tdnn_lstm_1d_sp +# WER on train_dev(tg) 13.17 12.90 +# [looped:] 13.01 +# WER on train_dev(fg) 12.25 11.90 +# [looped:] 12.13 +# WER on eval2000(tg) 15.4 15.7 +# [looped:] 15.7 +# WER on eval2000(fg) 14.1 14.2 +# [looped:] 14.4 +# Final train prob -0.046 -0.064 +# Final valid prob -0.073 -0.088 +# Final train prob (xent) -0.749 -0.836 +# Final valid prob (xent) -0.9084 -0.9631 + +# run_tdnn_lstm_1c.sh is like run_tdnn_lstm_1b.sh but using the +# new 'fast-lstm' layer. Results are slightly improved, plus +# it's faster. See PR #1243 on github, and issue #1237. +# This used to be called run_tdnn_fastlstm_1b.sh. + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1d # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 
input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..6987757757a --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -0,0 +1,327 @@ +#!/bin/bash + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + +# There seems to be no consistent difference in WER. Inconclusive. +# However I may keep 0.01 just for consistency with other setups. +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1d_sp tdnn_lstm_1e_sp +# System tdnn_lstm_1d_sp tdnn_lstm_1e_sp +# WER on train_dev(tg) 12.90 12.74 +# [looped:] 13.01 12.93 +# WER on train_dev(fg) 11.90 11.70 +# [looped:] 12.13 12.09 +# WER on eval2000(tg) 15.7 15.7 +# [looped:] 15.7 15.9 +# WER on eval2000(fg) 14.2 14.3 +# [looped:] 14.4 14.6 +# Final train prob -0.064 -0.066 +# Final valid prob -0.088 -0.087 +# Final train prob (xent) -0.836 -0.931 +# Final valid prob (xent) -0.9631 -1.0279 + +# Online decoding +# System tdnn_lstm_1e_sp_online tdnn_lstm_1e_sp +# WER on train_dev(tg) 12.93 12.74 +# WER on train_dev(fg) 12.05 11.87 +# WER on eval2000(tg) 15.5 15.4 +# WER on eval2000(fg) 14.0 13.8 + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1e # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. 
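# (frames_per_chunk=140,100,160 below asks the example generation to use a mix
#  of chunk widths rather than one fixed width, with the first value, 140, used
#  as the primary width at decode time; this is the "variable chunk-width"
#  change described in run_tdnn_lstm_1d.sh, and is why the average chunk size
#  quoted there is somewhat below the old fixed value of 150.)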
+frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
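+  # (Roughly speaking, decode_looped.sh evaluates the network continuously,
+  # carrying the recurrent state forward from chunk to chunk instead of relying
+  # on extra-left-context, so for forward-recurrent models like this TDNN-LSTM
+  # it should closely match what online decoding produces.)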
+ rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh new file mode 100755 index 00000000000..90e179379e4 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -0,0 +1,300 @@ +#!/bin/bash + +# run_tdnn_lstm_1f.sh is like run_tdnn_lstm_1e.sh but +# reducing the frames-per-iter from 1.5 million to 1 million, +# since the time per iter was more than usual (about 5 minutes). + +# Below, the WER seems to get a little worse, although the optimization +# is improved slightly. There seems to be more train/valid difference. +# see also 1i. + +# exp/chain/tdnn_lstm_1f_sp: num-iters=392 nj=3..16 num-params=39.6M dim=40+100->6042 combine=-0.080->-0.073 xent:train/valid[260,391,final]=(-1.06,-0.903,-0.916/-1.13,-1.03,-1.04) logprob:train/valid[260,391,final]=(-0.084,-0.064,-0.065/-0.100,-0.091,-0.090) + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# WER on train_dev(tg) 12.74 13.23 +# [looped:] 12.93 13.27 +# WER on train_dev(fg) 11.70 12.17 +# [looped:] 12.09 12.42 +# WER on eval2000(tg) 15.7 16.1 +# [looped:] 15.9 16.2 +# WER on eval2000(fg) 14.3 14.6 +# [looped:] 14.6 14.7 +# Final train prob -0.066 -0.065 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.916 +# Final valid prob (xent) -1.0279 -1.0359 + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + +# WER is worse but this seems to be due to more complete optimization +# (train better, valid worse). 
Looks like we may be overtraining. +# +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# WER on train_dev(tg) 12.74 13.23 +# [looped:] 12.93 13.27 +# WER on train_dev(fg) 11.70 12.17 +# [looped:] 12.09 12.42 +# WER on eval2000(tg) 15.7 16.1 +# [looped:] 15.9 16.2 +# WER on eval2000(fg) 14.3 14.6 +# [looped:] 14.6 14.7 +# Final train prob -0.066 -0.065 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.916 +# Final valid prob (xent) -1.0279 -1.0359 + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1f # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch 
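+  # (include-log-softmax=false because the 'chain' objective is computed on the
+  # unnormalized network output; output-delay delays the output by label_delay
+  # frames (5 here), which gives this unidirectional model a little future
+  # context to work with.)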
+ output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b1{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1000000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
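+  # (--self-loop-scale 1.0 is the standard setting for 'chain' graphs; together
+  # with --acwt 1.0 --post-decode-acwt 10.0 at decode time it keeps the lattice
+  # scores on roughly the same scale as conventional systems.)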
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..cb73f020e3e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,282 @@ +#!/bin/bash + +# 1g is like 1e, but reducing decay-time from 20 to 15, to see if +# it reduces the difference between regular and looped decoding. +# +# There doesn't seem to be a very consistent difference betwen 1e and 1g. + + +# exp/chain/tdnn_lstm_1g_sp: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6042 combine=-0.083->-0.076 xent:train/valid[173,261,final]=(-1.09,-0.929,-0.938/-1.15,-1.04,-1.05) logprob:train/valid[173,261,final]=(-0.089,-0.066,-0.067/-0.102,-0.089,-0.090) + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1g_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1g_sp +# WER on train_dev(tg) 12.74 13.03 +# [looped:] 12.93 12.98 +# WER on train_dev(fg) 11.70 12.02 +# [looped:] 12.09 12.13 +# WER on eval2000(tg) 15.7 15.6 +# [looped:] 15.9 15.9 +# WER on eval2000(fg) 14.3 14.1 +# [looped:] 14.6 14.4 +# Final train prob -0.066 -0.067 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.938 +# Final valid prob (xent) -1.0279 -1.0473 + + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1g # Note: _sp will get added to this if $speed_perturb == true. 
+decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=15" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. 
Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh new file mode 100755 index 00000000000..b12be22ce3d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1h.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# 1h is like 1e, but reducing the hidden-dims from 1024 to 880. + +# Does not seem to help; both train and valid probs get worse by about +# the same amount, and WER is overall just slightly worse. Maybe 1024 +# was approximately optimal. + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1h_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1h_sp +# WER on train_dev(tg) 12.74 13.06 +# [looped:] 12.93 13.17 +# WER on train_dev(fg) 11.70 12.13 +# [looped:] 12.09 12.27 +# WER on eval2000(tg) 15.7 15.7 +# [looped:] 15.9 15.9 +# WER on eval2000(fg) 14.3 14.4 +# [looped:] 14.6 14.5 +# Final train prob -0.066 -0.069 +# Final valid prob -0.087 -0.091 +# Final train prob (xent) -0.931 -0.967 +# Final valid prob (xent) -1.0279 -1.0631 + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir=exp/chain/tdnn_lstm_1d_sp/egs + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
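+  # (The '7000' argument below is the target number of tree leaves; stage 12
+  # reads the resulting num-pdfs back with 'tree-info' to set num_targets,
+  # i.e. the dimension of the two output layers.)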
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=880 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=880 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=880 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=880 recurrent-projection-dim=220 non-recurrent-projection-dim=220 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=880 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=880 + fast-lstmp-layer name=fastlstm2 cell-dim=880 recurrent-projection-dim=220 non-recurrent-projection-dim=220 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=880 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=880 + fast-lstmp-layer name=fastlstm3 cell-dim=880 recurrent-projection-dim=220 non-recurrent-projection-dim=220 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh new file mode 100755 index 00000000000..7e05834c1fb --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -0,0 +1,300 @@ +#!/bin/bash + +# run_tdnn_lstm_1i.sh is like run_tdnn_lstm_1{e,f}.sh but +# with a different frames-per-iter: 2 million, vs. 1.5 million +# (1e) and 1 million (1f) + +# Results are inconclusive regarding comparison with 1e: it's [0.3 worse, 0.1 +# better, 0.2 worse, same, 0.2 better, 0.2 better, 0.3 better, 0.3 better] on +# the different conditions. There is less train/valid difference and worse +# train prob [the trends of valid and train probs are consistent as we change +# the frames-per-iter]. + +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1{e,f,i}_sp 2>/dev/null +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp tdnn_lstm_1i_sp +# WER on train_dev(tg) 12.74 13.23 13.08 +# [looped:] 12.93 13.27 13.00 +# WER on train_dev(fg) 11.70 12.17 11.97 +# [looped:] 12.09 12.42 12.08 +# WER on eval2000(tg) 15.7 16.1 15.5 +# [looped:] 15.9 16.2 15.7 +# WER on eval2000(fg) 14.3 14.6 14.0 +# [looped:] 14.6 14.7 14.3 +# Final train prob -0.066 -0.065 -0.069 +# Final valid prob -0.087 -0.090 -0.088 +# Final train prob (xent) -0.931 -0.916 -0.947 +# Final valid prob (xent) -1.0279 -1.0359 -1.0419 + +# run_tdnn_lstm_1e.sh is like run_tdnn_lstm_1d.sh but +# trying the change of xent_regularize from 0.025 (which was an +# unusual value) to the more usual 0.01. + +# WER is worse but this seems to be due to more complete optimization +# (train better, valid worse). Looks like we may be overtraining. +# +# local/chain/compare_wer_general.sh --looped tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# System tdnn_lstm_1e_sp tdnn_lstm_1f_sp +# WER on train_dev(tg) 12.74 13.23 +# [looped:] 12.93 13.27 +# WER on train_dev(fg) 11.70 12.17 +# [looped:] 12.09 12.42 +# WER on eval2000(tg) 15.7 16.1 +# [looped:] 15.9 16.2 +# WER on eval2000(fg) 14.3 14.6 +# [looped:] 14.6 14.7 +# Final train prob -0.066 -0.065 +# Final valid prob -0.087 -0.090 +# Final train prob (xent) -0.931 -0.916 +# Final valid prob (xent) -1.0279 -1.0359 + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1i # Note: _sp will get added to this if $speed_perturb == true. +decode_iter=final + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +# End configuration section. 
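+# Everything above can be overridden from the command line once
+# utils/parse_options.sh is sourced below; for instance (illustrative), running
+#   local/chain/tuning/run_tdnn_lstm_1i.sh --stage 14
+# would skip training and go straight to graph building and decoding.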
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b1{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 2000000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 15 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" --iter $decode_iter \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" --iter $decode_iter \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done +fi +wait; + + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh new file mode 100755 index 00000000000..6a6a4ba30e1 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -0,0 +1,311 @@ +#!/bin/bash + +# same as 1e but with delay of -1 +# System tdnn_lstm_1e_sp tdnn_lstm_1j_sp +# WER on train_dev(tg) 12.74 12.95 +# WER on train_dev(fg) 11.70 12.01 +# WER on eval2000(tg) 15.7 15.3 +# WER on eval2000(fg) 14.3 13.9 +# Final train prob -0.066 -0.066 +# Final valid prob -0.087 -0.089 +# Final train prob (xent) -0.931 -0.921 +# Final valid prob (xent) -1.0279 -1.0363 +# exp/chain/tdnn_lstm_1j_sp/: num-iters=262 nj=3..16 num-params=39.6M dim=40+100->6067 combine=-0.076->-0.074 xent:train/valid[173,261,final]=(-1.08,-0.925,-0.921/-1.17,-1.04,-1.04) logprob:train/valid[173,261,final]=(-0.085,-0.067,-0.066/-0.103,-0.090,-0.089) + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1j # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_nj=50 + +# training options +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 + +chunk_left_context=40 +chunk_right_context=0 +# we'll put chunk-left-context-initial=0 and chunk-right-context-final=0 +# directly without variables. +frames_per_chunk=140,100,160 + +# (non-looped) decoding options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +# we'll put extra-left-context-initial=0 and extra-right-context-final=0 +# directly without variables. + + +remove_egs=false +common_egs_dir= + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + lstm_opts="decay-time=20" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-1 $lstm_opts + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi + +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --num-threads 4 \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 25 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if [ $stage -le 16 ]; then + # looped decoding. Note: this does not make sense for BLSTMs or other + # backward-recurrent setups, and for TDNNs and other non-recurrent there is no + # point doing it because it would give identical results to regular decoding. 
+ rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode_looped.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg_looped || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg}_looped || exit 1; + fi + ) & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in looped decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 17 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/swbd/s5c/local/eval1997_data_prep.sh b/egs/swbd/s5c/local/eval1997_data_prep.sh index f49ac551192..e29da13deee 100755 --- a/egs/swbd/s5c/local/eval1997_data_prep.sh +++ b/egs/swbd/s5c/local/eval1997_data_prep.sh @@ -5,13 +5,13 @@ # To be run from one directory above this script. -# The input is a directory name containing the 1997 Hub5 english evaluation +# The input is a directory name containing the 1997 Hub5 english evaluation # test set and transcripts, which is LDC2002S10 # e.g. see # http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002S10 # # It is assumed that the transcripts are in a subdirectory called transcr -# However, we download the STM from NIST site: +# However, we download the STM from NIST site: # ftp://jaguar.ncsl.nist.gov/lvcsr/mar97/eval/hub5e97.english.980618.stm if [ $# -ne 1 ]; then @@ -26,7 +26,7 @@ sdir=$1 [ ! -d $sdir/transcr ] \ && echo Expecting directory $sdir/transcr to be present && exit 1; -. path.sh +. path.sh dir=data/local/eval1997 mkdir -p $dir @@ -40,7 +40,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -49,8 +49,8 @@ awk -v sph2pipe=$sph2pipe '{ # segments file format is: utt-id side-id start-time end-time, e.g.: # sw02001-A_000098-001156 sw02001-A 0.98 11.56 pem=$sdir/speech/97_hub5e.pem -[ ! 
-f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +[ ! -f $pem ] && echo "$0: No such file $pem" && exit 1; +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 # There is one line in the 97_hub5e.pem with an extra : on the channel # sw_10022 B: unknown_speaker 281.21 284.37 -- the : is removed @@ -64,7 +64,7 @@ grep -v ';;' $pem | sed -e 's?:??g' \ printf "%s %s %.2f %.2f\n", utt, spk, start, end; }' \ | sort -u > $dir/segments - + # Download the STM and GLM files: ( cd $dir rm -f stm glm @@ -78,9 +78,9 @@ grep -v ';;' $pem | sed -e 's?:??g' \ # stm file has lines like: -# en_4042 A en_4042_A 227.71 232.26 BEANS RIGHT THAT IS WHY I SAID BEANS -# One of the segments (sw_10022-B_028120-028437) is removed since it is not -# scored and does not show up in the pem file. +# en_4042 A en_4042_A 227.71 232.26 BEANS RIGHT THAT IS WHY I SAID BEANS +# One of the segments (sw_10022-B_028120-028437) is removed since it is not +# scored and does not show up in the pem file. grep -v ';;' $dir/hub5e97.english.980618.stm \ | awk '{ spk=$1"-"$2; @@ -96,7 +96,7 @@ grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -116,4 +116,3 @@ done echo Data preparation and formatting completed for Eval 2000 echo "(but not MFCC extraction)" - diff --git a/egs/swbd/s5c/local/eval2000_data_prep.sh b/egs/swbd/s5c/local/eval2000_data_prep.sh index 8d7e1f7ed6e..4c34061a120 100755 --- a/egs/swbd/s5c/local/eval2000_data_prep.sh +++ b/egs/swbd/s5c/local/eval2000_data_prep.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Hub-5 Eval 2000 data preparation +# Hub-5 Eval 2000 data preparation # Author: Arnab Ghoshal (Jan 2013) # To be run from one directory above this script. -# The input is two directory names (possibly the same) containing the +# The input is two directory names (possibly the same) containing the # 2000 Hub5 english evaluation test set and transcripts, which are # respectively: LDC2002S09 LDC2002T43 # e.g. see @@ -35,7 +35,7 @@ tdir=$2 [ ! -d $tdir/reference ] \ && echo Expecting directory $tdir/reference to be present && exit 1; -. path.sh +. path.sh dir=data/local/eval2000 mkdir -p $dir @@ -49,7 +49,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -58,8 +58,8 @@ awk -v sph2pipe=$sph2pipe '{ # segments file format is: utt-id side-id start-time end-time, e.g.: # sw02001-A_000098-001156 sw02001-A 0.98 11.56 pem=$sdir/english/hub5e_00.pem -[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +[ ! 
-f $pem ] && echo "$0: No such file $pem" && exit 1; +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 # we ignore the warnings below for now, although they seem to indicate some problems @@ -72,7 +72,7 @@ grep -v ';;' $pem \ | sort -u | local/extend_segments.pl 0.1 > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -94,10 +94,10 @@ cp $tdir/reference/en20000405_hub5.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. -awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp diff --git a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh old mode 100644 new mode 100755 index 11742173120..7cf42c9ae04 --- a/egs/swbd/s5c/local/nnet3/compare_wer_general.sh +++ b/egs/swbd/s5c/local/nnet3/compare_wer_general.sh @@ -1,48 +1,99 @@ #!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/nnet3/compare_wer_general.sh tdnn_c_sp tdnn_d_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/nnet3/compare_wer_general.sh tdnn_d_sp tdnn_d_sp_smbr:1 tdnn_d_sp_smbr:2 ... + +echo "# $0 $*"; # print command line. + + echo -n "# System " -for x in $*; do printf "% 10s" $x; done +for x in $*; do printf " % 9s" $x; done echo + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free name, like: +# set_names tdnn_a_sp +# it will set dir=exp/nnet3/tdnn_a_sp and epoch_suffix="" +# If called with something like: +# set_names tdnn_d_sp_smbr:3 +# it will set dir=exp/nnet3/tdnn_d_sp_smbr and epoch_suffix="epoch3" +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + name=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + dirname=exp/nnet3/$name + if [ -z $epoch ]; then + epoch_suffix="" + else + used_epochs=true + epoch_suffix=_epoch${epoch} + fi +} + + echo -n "# WER on train_dev(tg) " for x in $*; do - wer=$(grep WER exp/nnet3/${x}_sp/decode_train_dev_hires_sw1_tg/wer_* | utils/best_wer.sh | awk '{print $2}') + set_names $x + # note: the '*' in the directory name is because there + # is _hires_ in there for the cross-entropy systems, and + # nothing for the sequence trained systems. 
+ wer=$(grep WER $dirname/decode_train_dev*sw1_tg$epoch_suffix/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo echo -n "# WER on train_dev(fg) " for x in $*; do - wer=$(grep WER exp/nnet3/${x}_sp/decode_train_dev_hires_sw1_fsh_fg/wer_* | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep WER $dirname/decode_train_dev*sw1_fsh_fg$epoch_suffix/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo echo -n "# WER on eval2000(tg) " for x in $*; do - wer=$(grep Sum exp/nnet3/${x}_sp/decode_eval2000_hires_sw1_tg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo echo -n "# WER on eval2000(fg) " for x in $*; do - wer=$(grep Sum exp/nnet3/${x}_sp/decode_eval2000_hires_sw1_fsh_fg/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + set_names $x + wer=$(grep Sum $dirname/decode_eval2000*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo +if $used_epochs; then + # we don't print the probs in this case. + exit 0 +fi + echo -n "# Final train prob " for x in $*; do - prob=$(grep log-likelihood exp/nnet3/${x}_sp/log/compute_prob_train.combined.log | awk '{print $8}') - printf "% 10s" $prob + set_names $x + prob=$(grep log-likelihood $dirname/log/compute_prob_train.combined.log | awk '{print $8}') + printf "% 10.3f" $prob done echo echo -n "# Final valid prob " for x in $*; do - prob=$(grep log-likelihood exp/nnet3/${x}_sp/log/compute_prob_valid.combined.log | awk '{print $8}') - printf "% 10s" $prob + set_names $x + prob=$(grep log-likelihood $dirname/log/compute_prob_valid.combined.log | awk '{print $8}') + printf "% 10.3f" $prob done echo - diff --git a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh index 99f6a31e708..ba751ad8732 100755 --- a/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh +++ b/egs/swbd/s5c/local/nnet3/run_blstm_discriminative.sh @@ -2,7 +2,9 @@ set -o pipefail set -e -# this is run_discriminative.sh + +# Caution: this script is out of date, it does not use the +# refactored discriminative training script with get_degs.sh. # This script does discriminative training on top of CE BLSTM system. # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, @@ -44,7 +46,6 @@ dir=${srcdir}_${criterion} ## Egs options frames_per_eg=150 frames_overlap_per_eg=30 -truncate_deriv_weights=10 ## Nnet training options effective_learning_rate=0.0000125 @@ -53,10 +54,6 @@ num_jobs_nnet=4 num_epochs=4 regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -modify_learning_rates=true -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. 
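# The name:epoch convention introduced in compare_wer_general.sh above can be
# exercised on its own; the following is a minimal standalone sketch, assuming
# the example system names from that script's usage comment and the exp/nnet3
# prefix it hardcodes:
for sys in tdnn_d_sp tdnn_d_sp_smbr:3; do
  name=$(echo $sys | cut -d: -f1)
  epoch=$(echo $sys | cut -s -d: -f2)   # empty when there is no colon
  if [ -z "$epoch" ]; then epoch_suffix=""; else epoch_suffix=_epoch${epoch}; fi
  echo "dir=exp/nnet3/$name epoch_suffix='$epoch_suffix'"
done
# which prints:
#   dir=exp/nnet3/tdnn_d_sp epoch_suffix=''
#   dir=exp/nnet3/tdnn_d_sp_smbr epoch_suffix='_epoch3'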
@@ -138,15 +135,12 @@ if [ -z "$degs_dir" ]; then # have a higher maximum num-jobs if if [ -d ${srcdir}_degs/storage ]; then max_jobs=10; else max_jobs=5; fi - degs_opts="--determinize true --minimize true --remove-output-symbols true --remove-epsilons true --collapse-transition-ids true" - steps/nnet3/get_egs_discriminative.sh \ --cmd "$decode_cmd --max-jobs-run $max_jobs --mem 20G" --stage $get_egs_stage --cmvn-opts "$cmvn_opts" \ - --adjust-priors $adjust_priors \ --online-ivector-dir $online_ivector_dir \ --left-context $left_context --right-context $right_context \ $frame_subsampling_opt \ - --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg ${degs_opts} \ + --frames-per-eg $frames_per_eg --frames-overlap-per-eg $frames_overlap_per_eg \ $train_data_dir data/lang ${srcdir}_ali $lats_dir $srcdir/final.mdl $degs_dir ; fi fi @@ -159,8 +153,6 @@ if [ $stage -le 4 ]; then --num-epochs $num_epochs --one-silence-class $one_silence_class --minibatch-size $minibatch_size \ --num-jobs-nnet $num_jobs_nnet --num-threads $num_threads \ --regularization-opts "$regularization_opts" \ - --truncate-deriv-weights $truncate_deriv_weights --adjust-priors $adjust_priors \ - --modify-learning-rates $modify_learning_rates --last-layer-factor $last_layer_factor \ ${degs_dir} $dir fi @@ -170,7 +162,7 @@ if [ $stage -le 5 ]; then for decode_set in train_dev eval2000; do ( num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` - iter=epoch$x.adj + iter=epoch${x}_adj steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ --online-ivector-dir exp/nnet3/ivectors_${decode_set} $context_opts \ @@ -195,4 +187,3 @@ fi exit 0; - diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh index 109396ed72e..b64d3e468df 100755 --- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh @@ -13,6 +13,9 @@ speed_perturb=true mkdir -p nnet3 # perturbed data preparation train_set=train_nodup + +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + if [ "$speed_perturb" == "true" ]; then if [ $stage -le 1 ]; then #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment @@ -59,18 +62,7 @@ if [ $stage -le 3 ]; then for dataset in $train_set train_100k_nodup; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires - # scale the waveforms, this is useful as we don't use CMVN - data_dir=data/${dataset}_hires - cat $data_dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; - mv $data_dir/wav.scp_scaled $data_dir/wav.scp + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; @@ -81,7 +73,7 @@ for line in sys.stdin.readlines(): utils/fix_data_dir.sh data/${dataset}_hires; done - for dataset in eval2000 train_dev rt03; do + for dataset in eval2000 train_dev $maybe_rt03; do # Create MFCCs for the eval set utils/copy_data_dir.sh data/$dataset data/${dataset}_hires steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ @@ -128,12 +120,12 @@ if 
[ $stage -le 8 ]; then # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). - steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1; - for data_set in eval2000 train_dev rt03; do + for data_set in eval2000 train_dev $maybe_rt03; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1; done diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh new file mode 120000 index 00000000000..e4d47deb7a4 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/run_tdnn_disc.sh @@ -0,0 +1 @@ +tuning/run_tdnn_d_disc.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh b/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh deleted file mode 100755 index f422aa92e38..00000000000 --- a/egs/swbd/s5c/local/nnet3/run_tdnn_discriminative.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -e -# this is run_discriminative.sh - -# This script does discriminative training on top of CE nnet3 system. -# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, -# since the lattice generation runs in about real-time, so takes of the order of -# 1000 hours of CPU time. -# -. cmd.sh - - -stage=0 -train_stage=-10 # can be used to start training in the middle. -get_egs_stage=-10 -use_gpu=true # for training -cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like denlats, - # alignments and degs). - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -srcdir=exp/nnet3/nnet_ms_a -train_data_dir=data/train_nodup_sp_hires -online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp -degs_dir= # If provided, will skip the degs directory creation -lats_dir= # If provided, will skip denlats creation - -## Objective options -criterion=smbr -one_silence_class=true - -dir=${srcdir}_${criterion} - -## Egs options -frames_per_eg=150 -frames_overlap_per_eg=30 -truncate_deriv_weights=10 - -## Nnet training options -effective_learning_rate=0.0000125 -max_param_change=1 -num_jobs_nnet=4 -num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options -minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -modify_learning_rates=true -last_layer_factor=0.1 - -## Decode options -decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. - -if $use_gpu; then - if ! 
cuda-compiled; then - cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +graph_dir=exp/tri4/graph_sw1_tg +if [ $stage -le 11 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh new file mode 100755 index 00000000000..a82b2078acb --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1a.sh @@ -0,0 +1,161 @@ +#!/bin/bash + +# _lfr1a is as _c, but is LFR (low frame rate): it uses triphone chain topology +# with a frame subsampling factor of 3. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. 
+ +# System tdnn_c tdnn_lfr1a +# WER on train_dev(tg) 17.37 17.25 +# WER on train_dev(fg) 15.94 15.90 +# WER on eval2000(tg) 20.0 20.1 +# WER on eval2000(fg) 18.2 18.5 +# Final train prob -1.43781 -1.32434 +# Final valid prob -1.56895 -1.42206 + + +stage=11 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +reporting_email= +remove_egs=true +leftmost_questions_truncate=-1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --repeat-frames true --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 8400 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,2) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 + relu-renorm-layer name=tdnn5 dim=1024 + + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $treedir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +echo 3 >$dir/frame_subsampling_factor +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
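  # A note on the constants (presumably, since the script does not spell this
  # out): the 0.333 below, and the --acwt 0.333 with --post-decode-acwt 3.0 in
  # the decoding stage that follows, all come from the frame subsampling
  # factor of 3, since 1/3 = 0.333; the post-decode scaling multiplies the
  # lattice acoustic scores back up by 3 so that the standard scoring scripts,
  # which assume language-model weights around 10, can be used unchanged.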
+ utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir +fi + +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh new file mode 100755 index 00000000000..8c80dc3d7ad --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1b.sh @@ -0,0 +1,163 @@ +#!/bin/bash + +# _lfr1b is as _lfr1a, but with one more -3,3 layer (the comparable +# non-LFR system is tdnn_d) + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# System tdnn_d tdnn_lfr1a tdnn_lfr1b +# WER on train_dev(tg) 16.72 17.25 17.00 +# WER on train_dev(fg) 15.31 15.90 15.57 +# WER on eval2000(tg) 19.2 20.1 19.3 +# WER on eval2000(fg) 17.8 18.5 17.8 +# Final train prob -1.22859 -1.32434 -1.11497 +# Final valid prob -1.354 -1.42206 -1.21105 + + + +stage=0 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +reporting_email= +remove_egs=true +leftmost_questions_truncate=-1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --repeat-frames true --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 8400 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,2) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-7,2) dim=1024 + relu-renorm-layer name=tdnn6 dim=1024 + + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $treedir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +echo 3 >$dir/frame_subsampling_factor +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir +fi + +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh new file mode 100755 index 00000000000..95cdbf7f975 --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +# _lfr1c is as _lfr1a, but uses splicing similar to chain's without changing +# number of layers (comparable non-LFR system is tdnn_e). + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# System tdnn_c tdnn_e tdnn_lfr1c +# WER on train_dev(tg) 17.37 16.75 17.10 +# WER on train_dev(fg) 15.94 15.34 15.74 +# WER on eval2000(tg) 20.0 19.5 19.2 +# WER on eval2000(fg) 18.2 18.0 17.7 +# Final train prob -1.43781 -1.40491 -1.29898 +# Final valid prob -1.56895 -1.55255 -1.43117 + + +stage=11 +affix= +train_stage=-10 +has_fisher=true +speed_perturb=true +common_egs_dir= +#exp/nnet3/tdnn_lfr1b_sp/egs +reporting_email= +remove_egs=true +leftmost_questions_truncate=-1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 10 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --repeat-frames true --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 8400 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 11 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + + output-layer name=output input=tdnn5 dim=$num_targets max-change=1.5 presoftmax-scale-file=$dir/configs/presoftmax_prior_scale.vec +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 2 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $treedir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +echo 3 >$dir/frame_subsampling_factor +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 0.333 data/lang_sw1_tg $dir $graph_dir +fi + +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $num_jobs --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_hires_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_hires_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; + diff --git a/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh new file mode 100755 index 00000000000..734c5a5d1be --- /dev/null +++ b/egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c_disc.sh @@ -0,0 +1,210 @@ +#!/bin/bash + +# This script does discriminative training on top of the CE nnet3 LFR system +# from run_tdnn_lfr1c. To simplify things, this assumes you are using the +# "speed-perturbed" data +# (--speed_perturb true, which is the default) in the baseline run_tdnn_d.sh script. +# +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the lattice generation runs in about real-time, so takes of the order of +# 1000 hours of CPU time. + +# Comparing effect of shift: +# System tdnn_lfr1c_sp_smbr:1 tdnn_lfr1c_sp_smbr:2 tdnn_lfr1c_sp_smbr:3 tdnn_lfr1c_sp_fs_smbr:1 tdnn_lfr1c_sp_fs_smbr:2 tdnn_lfr1c_sp_fs_smbr:3 +# WER on train_dev(tg) 16.26 16.11 16.02 16.02 15.77 15.78 +# WER on train_dev(fg) 15.01 14.91 14.80 14.79 14.58 14.50 +# WER on eval2000(tg) 18.9 18.7 18.6 18.6 18.5 18.5 +# WER on eval2000(fg) 17.4 17.2 17.1 17.1 17.0 16.9 + + +set -e +set -uo pipefail + +stage=0 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=0 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like + # alignments and degs). +degs_dir= # set this to use preexisting degs. +nj=65 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + +## Objective options +criterion=smbr +one_silence_class=true + +# you can set --disc-affix if you run different configurations, e.g. --disc-affix "_b" +# originally ran with no affix, with effective_learning_rate=0.0000125; +# reran by mistake with no affix with effective_learning_rate=0.000005 [was a bit +# better, see NOTES, but still best after 1st epoch]. +# reran again with affix=slow and effective_learning_rate=0.0000025 +# reran again with affix=slow2 and effective_learning_rate=0.00000125 (this was +# about the best). +# before checking in the script, removed the slow2 affix but left with +# the lowest learning rate. +disc_affix= + +## Egs options. Give quite a few choices of chunk length, +## so it can split utterances without much gap or overlap. +frames_per_eg=300,280,150,120,100 +frames_overlap_per_eg=0 +frames_per_chunk_decoding=200 +## these context options should match the training condition. (chunk_left_context, +## chunk_right_context) +## We set --extra-left-context-initial 0 and --extra-right-context-final 0 +## directly in the script below, but this should also match the training condition. +## Note: extra-left-context and extra-right-context are 0 because this is a TDNN, +## it's not a recurrent model like an LSTM or BLSTM. 
+extra_left_context=0 +extra_right_context=0 + + +## Nnet training options +effective_learning_rate=0.00000125 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=3 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). +shift_feats=false + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +srcdir=exp/nnet3/tdnn_lfr1c_sp +graph_dir=$srcdir/graph_sw1_tg +train_data_dir=data/train_nodup_sp_hires +online_ivector_dir=exp/nnet3/ivectors_train_nodup_sp +dir=${srcdir}_${criterion}${disc_affix} + + +if $use_gpu; then + if ! cuda-compiled; then + cat <" + echo "Usage: $0 " + echo "e.g.: $0 /export/corpora/LDC/LDC2007S10" echo "See comments in the script for more details" exit 1 fi @@ -19,7 +20,7 @@ sdir=$1 [ ! -d $sdir/data/references/eval03/english/cts ] \ && echo Expecting directory $tdir/data/references/eval03/english/cts to be present && exit 1; -. path.sh +. path.sh dir=data/local/rt03 mkdir -p $dir @@ -37,7 +38,7 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; awk -v sph2pipe=$sph2pipe '{ - printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); + printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2); }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 @@ -47,7 +48,7 @@ awk -v sph2pipe=$sph2pipe '{ # sw02001-A_000098-001156 sw02001-A 0.98 11.56 #pem=$sdir/english/hub5e_00.pem #[ ! -f $pem ] && echo "No such file $pem" && exit 1; -# pem file has lines like: +# pem file has lines like: # en_4156 A unknown_speaker 301.85 302.48 #grep -v ';;' $pem \ @@ -59,7 +60,7 @@ cat $tdir/*.stm | grep -v ';;' | grep -v inter_segment_gap \ | sort -u > $dir/segments # stm file has lines like: -# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER +# en_4156 A en_4156_A 357.64 359.64 HE IS A POLICE OFFICER # TODO(arnab): We should really be lowercasing this since the Edinburgh # recipe uses lowercase. This is not used in the actual scoring. #grep -v ';;' $tdir/reference/hub5e00.english.000405.stm \ @@ -77,7 +78,7 @@ cat $tdir/*.stm | \ grep -v inter_segment_gap | \ awk '{ printf $1; if ($1==";;") printf(" %s",$2); else printf(($2==1)?" A":" B"); for(n=3;n<=NF;n++) printf(" %s", $n); print ""; }'\ - > $dir/stm + > $dir/stm #$tdir/reference/hub5e00.english.000405.stm > $dir/stm cp $rtroot/data/trans_rules/en20030506.glm $dir/glm @@ -87,10 +88,10 @@ cp $rtroot/data/trans_rules/en20030506.glm $dir/glm echo "Segments from pem file and stm file do not match." && exit 1; grep -v IGNORE_TIME_SEGMENT_ $dir/text.all > $dir/text - + # create an utt2spk file that assumes each conversation side is # a separate speaker. 
-awk '{print $1,$2;}' $dir/segments > $dir/utt2spk +awk '{print $1,$2;}' $dir/segments > $dir/utt2spk utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt # cp $dir/segments $dir/segments.tmp @@ -110,4 +111,3 @@ done echo Data preparation and formatting completed for RT-03 echo "(but not MFCC extraction)" - diff --git a/egs/swbd/s5c/local/swbd1_prepare_dict.sh b/egs/swbd/s5c/local/swbd1_prepare_dict.sh index 673513806dc..3d9297b5f19 100755 --- a/egs/swbd/s5c/local/swbd1_prepare_dict.sh +++ b/egs/swbd/s5c/local/swbd1_prepare_dict.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Formatting the Mississippi State dictionary for use in Edinburgh. Differs +# Formatting the Mississippi State dictionary for use in Edinburgh. Differs # from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013) # To be run from one directory above this script. @@ -16,7 +16,7 @@ mkdir -p $dir srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text # assume swbd_p1_data_prep.sh was done already. -[ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; +[ ! -f "$srcdict" ] && echo "$0: No such file $srcdict" && exit 1; cp $srcdict $dir/lexicon0.txt || exit 1; patch WOLMANIZED # Also, mispronounced words, e.g. @@ -90,4 +90,3 @@ ln -sf lexicon5.txt lexicon.txt # This is the final lexicon. popd >&/dev/null rm $dir/lexiconp.txt 2>/dev/null echo Prepared input dictionary and phone-sets for Switchboard phase 1. - diff --git a/egs/swbd/s5c/run.sh b/egs/swbd/s5c/run.sh index 0eafe73d046..8b08419007d 100755 --- a/egs/swbd/s5c/run.sh +++ b/egs/swbd/s5c/run.sh @@ -72,11 +72,16 @@ fi # local/eval2000_data_prep.sh /home/dpovey/data/LDC2002S09/hub5e_00 /home/dpovey/data/LDC2002T43 local/eval2000_data_prep.sh /export/corpora2/LDC/LDC2002S09/hub5e_00 /export/corpora2/LDC/LDC2002T43 +# prepare the rt03 data. Note: this isn't 100% necessary for this +# recipe, not all parts actually test using rt03. +local/rt03_data_prep.sh /export/corpora/LDC/LDC2007S10 + # Now make MFCC features. # mfccdir should be some place with a largish disk where you # want to store MFCC features. 
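# (When data/rt03 has not been prepared, $maybe_rt03 below expands to nothing;
# because it is unquoted, the for-loop over "train eval2000 $maybe_rt03" then
# iterates over the first two sets only. The same guard is used in
# local/nnet3/run_ivector_common.sh.)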
+if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi mfccdir=mfcc -for x in train eval2000; do +for x in train eval2000 $maybe_rt03; do steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" \ data/$x exp/make_mfcc/$x $mfccdir steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir diff --git a/egs/tedlium/s5/cmd.sh b/egs/tedlium/s5/cmd.sh index ba7f120e599..4e0263d7cca 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -11,9 +11,9 @@ #export cuda_cmd=run.pl # JHU cluster: -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export cuda_cmd="queue.pl -l arch=*64* --gpu 1" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export cuda_cmd="queue.pl --gpu 1" host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then @@ -23,10 +23,10 @@ if [ ${host#*.} == "fit.vutbr.cz" ]; then storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" elif [ ${host#*.} == "cm.cluster" ]; then # MARCC bluecrab cluster: export train_cmd="slurm.pl --time 4:00:00 " export decode_cmd="slurm.pl --mem 4G --time 4:00:00 " - export cuda_cmd="slurm.pl --gpu 1" + export cuda_cmd="slurm.pl --gpu 1" fi diff --git a/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh b/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh index a5b80505393..8d7393af853 100755 --- a/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh +++ b/egs/tedlium/s5/local/nnet3/run_tdnn_discriminative.sh @@ -4,7 +4,7 @@ # note: this relies on having a cluster that has plenty of CPUs as well as GPUs, # since the lattice generation runs in about real-time, so takes of the order of # 1000 hours of CPU time. -# +# #%WER 13.3 | 507 17792 | 89.1 8.2 2.8 2.4 13.3 86.0 | -0.207 | exp/nnet3/tdnn_smbr/decode_dev_epoch1.adj/score_12_1.0/ctm.filt.filt.sys #%WER 12.4 | 507 17792 | 89.8 7.5 2.7 2.2 12.4 85.4 | -0.305 | exp/nnet3/tdnn_smbr/decode_dev_epoch1.adj_rescore/score_12_1.0/ctm.filt.filt.sys @@ -52,27 +52,22 @@ dir=${srcdir}_${criterion} ## Egs options frames_per_eg=150 frames_overlap_per_eg=30 -truncate_deriv_weights=10 ## Nnet training options effective_learning_rate=0.0000125 max_param_change=1 num_jobs_nnet=4 num_epochs=4 -regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options minibatch_size=64 -adjust_priors=true # May need to be set to false - # because it does not help in some setups -modify_learning_rates=true -last_layer_factor=0.1 ## Decode options decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. if $use_gpu; then if ! cuda-compiled; then - cat <${dir}_online/sample_decode.sh . cmd.sh data_dir=\$1 # e.g. data/dev_hires (to be prepared by the user, see egs/tedlium/run.sh for examples) -model_dir=\$2 # e.g. exp/nnet2_online/nnet_ms_sp_online (provided in the distribution) +model_dir=\$2 # e.g. 
exp/nnet2_online/nnet_ms_sp_online (provided in the distribution) decode_dir=\$model_dir/\`basename \$data_dir\` num_jobs=\`cat \$data_dir/spk2utt | wc -l\` diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index aebbd66349a..00b2d29cc88 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -1,64 +1,106 @@ #!/bin/bash -echo $0 $* +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} -echo -n "System " -for x in $*; do printf "% 10s" " $(basename $x)"; done -echo -echo -n "WER on dev(orig) " -for x in $*; do - wer=$(grep Sum $x/decode_dev/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo +echo "# $0 $*" -echo -n "WER on dev(rescored)" -for x in $*; do - wer=$(grep Sum $x/decode_dev_rescore/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done -echo +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi -echo -n "WER on test(orig) " -for x in $*; do - wer=$(grep Sum $x/decode_test/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer -done +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done echo -echo -n "WER on test(rescored)" -for x in $*; do - wer=$(grep Sum $x/decode_test_rescore/score*/*ys | utils/best_wer.sh | awk '{print $2}') - printf "% 10s" $wer +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi done -echo -echo -n "Final train prob " +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular 
and discriminatively trained systems. +fi + +echo -n "# Final train prob " for x in $*; do prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final valid prob " +echo -n "# Final valid prob " for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final train prob (xent)" +echo -n "# Final train prob (xent)" for x in $*; do prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "Final valid prob (xent)" +echo -n "# Final valid prob (xent)" for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done + echo diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh index 8e647598556..fbc28248491 120000 --- a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm.sh @@ -1 +1 @@ -tuning/run_tdnn_lstm_1a.sh \ No newline at end of file +tuning/run_tdnn_lstm_1e.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh new file mode 120000 index 00000000000..d4268b4185a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/run_tdnn_lstm_disc.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1e_disc.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..f7a18b4bfcf --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1c.sh @@ -0,0 +1,295 @@ +#!/bin/bash + +# run_tdnn_1c.sh is like run_tdnn_1b.sh but changing chunk-width from 150 to +# '140,110,160', and +# and --trainer.num-chunk-per-minibatch from 128 to 128,64. +# Not better; if anything a little worse. But could possibly be noise. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1b_sp_bi exp/chain_cleaned/tdnn1c_sp_bi +# System tdnn1b_sp_bi tdnn1c_sp_bi +# WER on dev(orig) 9.4 9.8 +# WER on dev(rescored) 8.8 9.0 +# WER on test(orig) 9.6 9.7 +# WER on test(rescored) 9.0 9.2 +# Final train prob -0.0870 -0.0942 +# Final valid prob -0.1147 -0.1108 +# Final train prob (xent) -1.4014 -1.4227 +# Final valid prob (xent) -1.5634 -1.4884 + + +# run_tdnn_1b.sh is like run_tdnn_1a.sh but upgrading to xconfig-based +# config generation. + +# Results (11/29/2016, note, this build is is before the upgrade of the LM +# done in Nov 2016): +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_sp_bi exp/chain_cleaned/tdnn1b_sp_bi +# System tdnn_sp_bi tdnn1b_sp_bi +# WER on dev(orig) 10.3 10.2 +# WER on dev(rescored) 9.8 9.6 +# WER on test(orig) 9.8 9.7 +# WER on test(rescored) 9.3 9.2 +# Final train prob -0.0918 -0.0928 +# Final valid prob -0.1190 -0.1178 +# Final train prob (xent) -1.3572 -1.4666 +# Final valid prob (xent) -1.4415 -1.5473 + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. 
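# (On the chunk-width and num-chunk-per-minibatch change described at the top:
# presumably '140,110,160' means the egs are dumped with a mix of those three
# chunk lengths rather than a single fixed 150, and '128,64' means the trainer
# may also form smaller 64-chunk minibatches, in the same "smaller size for
# mop-up" sense used for minibatch_size in the discriminative scripts.)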
+ +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1c #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width '140,110,160' \ + --trainer.num-chunk-per-minibatch '128,64' \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
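  # (Relative to the regular decoding stage above, the only changes appear to
  # be decode_looped.sh itself, the hardcoded --frames-per-chunk 30, the
  # decode_looped_* output directory names, and dropping --num-threads, which
  # as noted is not supported here.)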
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..99921a9bf61 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,256 @@ +#!/bin/bash + + +# run_tdnn_1d.sh is like run_tdnn_1b.sh but using 10 times the self-repair +# scale on the 1st TDNN layer. +# seems a little better- I wouldn't say it was significant normally, but +# it definitely stops the 1st TDNN layer from having under/over-saturated +# neurons. + +# exp/chain_cleaned/tdnn1b_sp_bi: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3606 combine=-0.10->-0.10 xent:train/valid[167,252,final]=(-1.47,-1.40,-1.40/-1.61,-1.57,-1.56) logprob:train/valid[167,252,final]=(-0.096,-0.087,-0.087/-0.119,-0.115,-0.115) +# exp/chain_cleaned/tdnn1d_sp_bi: num-iters=253 nj=2..12 num-params=7.0M dim=40+100->3606 combine=-0.10->-0.10 xent:train/valid[167,252,final]=(-1.46,-1.39,-1.39/-1.61,-1.56,-1.55) logprob:train/valid[167,252,final]=(-0.096,-0.088,-0.088/-0.120,-0.115,-0.115) + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn1b_sp_bi exp/chain_cleaned/tdnn1d_sp_bi +# System tdnn1b_sp_bi tdnn1d_sp_bi +# WER on dev(orig) 9.4 9.5 +# WER on dev(rescored) 8.8 8.6 +# WER on test(orig) 9.6 9.4 +# WER on test(rescored) 9.0 8.9 +# Final train prob -0.0870 -0.0878 +# Final valid prob -0.1147 -0.1152 +# Final train prob (xent) -1.4014 -1.3921 +# Final valid prob (xent) -1.5634 -1.5543 + +# run_tdnn_1b.sh is like run_tdnn_1a.sh but upgrading to xconfig-based +# config generation. + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1d #affix for TDNN directory, e.g. 
"a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=450 self-repair-scale=1.0e-04 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=450 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=450 + relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-renorm-layer name=tdnn6 input=Append(-6,-3,0) dim=450 + + ## adding the layers for chain branch + relu-renorm-layer name=prefinal-chain input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-renorm-layer name=prefinal-xent input=tdnn6 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..eb2c91dc3d4 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,317 @@ +#!/bin/bash + +# this is as run_tdnn_lstm_1a.sh, but changing +# frames_per_chunk 150 to 140,100,160 +# and --trainer.num-chunk-per-minibatch from 128 to 128,64 +# and adding +# --egs.chunk-left-context-initial=0 +# and --egs.chunk-right-context-final=0 +# See 1e for summary of results. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1b #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
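+  # (Side note, purely for illustration: the fixed left/right context that the
+  # compiled network needs because of the TDNN splicing can be inspected with
+  #   nnet3-am-info $dir/final.mdl | grep -i context
+  # The larger extra-left-context=50 used in the regular decoding stage above
+  # mainly serves to give the LSTM recurrence some history to warm up on; in
+  # looped decoding that state is carried over between chunks, which is why
+  # only --extra-left-context-initial is passed below.)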
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh new file mode 100755 index 00000000000..bb3c5b1a942 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1c.sh @@ -0,0 +1,279 @@ +#!/bin/bash + +# 1c is as 1b, but adding the option --slow-start true. [since removed; it +# takes half the param change from the first two minibatches of each +# job]. The difference is probably just random noise. + + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_lstm1b_sp_bi exp/chain_cleaned/tdnn_lstm1c_sp_bi +# System tdnn_lstm1b_sp_bi tdnn_lstm1c_sp_bi +# WER on dev(orig) 9.1 8.9 +# WER on dev(rescored) 8.4 8.2 +# WER on test(orig) 8.9 8.9 +# WER on test(rescored) 8.4 8.5 +# Final train prob -0.0621 -0.0620 +# Final valid prob -0.0799 -0.0811 +# Final train prob (xent) -0.8300 -0.8117 +# Final valid prob (xent) -0.9500 -0.9448 + + + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1c #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. 
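+# Note that all the variables in this configuration section can be overridden
+# on the command line, since utils/parse_options.sh maps e.g. --common-egs-dir
+# to the variable common_egs_dir.  For instance (illustrative only), to have
+# this script dump its own egs instead of reusing the 1b egs set just below:
+#   local/chain/tuning/run_tdnn_lstm_1c.sh --common-egs-dir ""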
+common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
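+  # (A concrete instance of the arithmetic: with the default xent_regularize=0.1
+  # set at the top of this script, the learning_rate_factor computed above is
+  # 0.5 / 0.1 = 5.0, so the output-xent layer below ends up with
+  # learning-rate-factor=5.0.)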
+ output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.slow-start true \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh new file mode 100755 index 00000000000..4be28a4ca97 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1d.sh @@ -0,0 +1,313 @@ +#!/bin/bash + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. 
note: it +# uses egs from 1b, remember to remove that before I commit. +# See 1e for summary of results. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1d #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
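+  # (For reference: the output of the command below is a set of compressed
+  # lattice archives, $lat_dir/lat.*.gz.  Purely as an illustration, one of
+  # them can be viewed in text form with
+  #   lattice-copy "ark:gunzip -c $lat_dir/lat.1.gz |" ark,t:- | head
+  # which can be handy for sanity-checking the alignment lattices.)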
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=40 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=40 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=40 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
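+  # (If the model is already trained and you only want to redo this looped
+  # decoding pass, the stage mechanism lets you re-enter the script here; for
+  # example, illustratively:
+  #   local/chain/tuning/run_tdnn_lstm_1d.sh --stage 21
+  # since all the earlier blocks are guarded by "[ $stage -le N ]" with N < 21,
+  # they will simply be skipped.)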
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh new file mode 100755 index 00000000000..e56946c1b54 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e.sh @@ -0,0 +1,395 @@ +#!/bin/bash + +# 1e is as 1d, but reducing decay-time from 40 to 20. + +# The following table shows comparison of various decay-time values, +# namely: [b:unset=infinity, f:80, d:40, e:20, g:10, g2:5]. +# note: the g2 script is not checked in. +# There is no clear trend on the non-looped decoding, but looped decoding seems +# to improve as decay-time is decreased. We end up recommending decay-time=20, +# as by then we get all the improvement on looped decoding, and it's the +# most conservative setting with which we can get this improvement (although +# actually it seems fine to use an even smaller decay-time). + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{b,f,d,e,g,g2}_sp_bi + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1b_sp_bi exp/chain_cleaned/tdnn_lstm1f_sp_bi exp/chain_cleaned/tdnn_lstm1d_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1g_sp_bi exp/chain_cleaned/tdnn_lstm1g2_sp_bi +# System tdnn_lstm1b_sp_bi tdnn_lstm1f_sp_bi tdnn_lstm1d_sp_bi tdnn_lstm1e_sp_bi tdnn_lstm1g_sp_bi tdnn_lstm1g2_sp_bi +# WER on dev(orig) 9.1 8.8 9.0 9.0 9.0 9.4 +# [looped:] 9.4 9.3 9.2 9.0 8.9 9.4 +# WER on dev(rescored) 8.4 8.2 8.4 8.4 8.4 8.7 +# [looped:] 8.8 8.7 8.6 8.4 8.3 8.7 +# WER on test(orig) 8.9 9.0 8.9 8.8 8.8 9.3 +# [looped:] 9.3 9.3 9.0 8.8 8.8 9.2 +# WER on test(rescored) 8.4 8.6 8.3 8.4 8.4 8.9 +# [looped:] 8.7 8.9 8.5 8.3 8.4 8.8 +# Final train prob -0.0621 -0.0631 -0.0595 -0.0648 -0.0689 -0.0739 +# Final valid prob -0.0799 -0.0802 -0.0823 -0.0827 -0.0890 -0.0963 +# Final train prob (xent) -0.8300 -0.8295 -0.8129 -0.8372 -0.8610 -0.8792 +# Final valid prob (xent) -0.9500 -0.9662 -0.9589 -0.9497 -0.9982 -1.0256 + + +# the following table compares the 'online' decoding with regular and looped +# decoding. online decoding is a little better than either (possibly due to +# using slightly later iVectors). +# +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi{,_online} 2>/dev/null +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi_online +# System tdnn_lstm1e_sp_bi tdnn_lstm1e_sp_bi_online +# WER on dev(orig) 9.0 8.8 +# [looped:] 9.0 +# WER on dev(rescored) 8.4 8.4 +# [looped:] 8.4 +# WER on test(orig) 8.8 8.8 +# [looped:] 8.8 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.3 + + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. 
note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1e #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results very much (unlike + # regular decoding)... [it will affect them slightly due to differences in the + # iVector extraction; probably smaller will be worse as it sees less of the future, + # but in a real scenario, long chunks will introduce excessive latency]. 
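+  # (A minimal sketch of how one could probe that chunk-size trade-off on the
+  # dev set; the decode directory suffix is made up for this illustration and
+  # all other options match the real invocation below:
+  #   for fpc in 20 30 50; do
+  #     steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \
+  #       --acwt 1.0 --post-decode-acwt 10.0 --frames-per-chunk $fpc \
+  #       --extra-left-context-initial $extra_left_context_initial \
+  #       --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_dev_hires \
+  #       --scoring-opts "--min-lmwt 5" \
+  #       $dir/graph data/dev_hires $dir/decode_looped_dev_fpc${fpc}
+  #   done )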
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if $test_online_decoding && [ $stage -le 22 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + data/lang_chain exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online + + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # note: we just give it "$dset" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --extra-left-context-initial $extra_left_context_initial \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset} ${dir}_online/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}_online/decode_${dset} ${dir}_online/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh new file mode 100755 index 00000000000..0d64c75aea8 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1e_disc.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# This script does discriminative training on top of the 1e chain system. To +# simplify things, this assumes you are using the "cleaned" data (since this is +# generally better), i.e. it won't work if you used options to run_tdnn_lstm_1e.sh +# to use the non-cleaned data. +# +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the alignment and the lattice generation/egs-dumping takes quite a bit +# of CPU time. + + +# Below is with 0.00002 and last_layer_factor=0.5 +# this is the setting we're leaving in the script, but the discriminative training +# is not really helping. Maybe we should try the frame-shifted version. 
+# steps/info/nnet3_disc_dir_info.pl exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow2 +# exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow2:num-jobs=4;effective-lrate=2e-05;last-layer-factor=0.50;iters-per-epoch=138;epoch[0,1,2]:train-objf=[0.94,0.96,0.97],valid-objf=[0.95,0.96,0.96],train-counts=[0.24,0.12,0.10],valid-counts=[0.28,0.20,0.17] +# b01:s5_r2: steps/info/nnet3_disc_dir_info.pl exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbroutslow2:{1,2} +# System tdnn_lstm1e_sp_bi tdnn_lstm1e_sp_bi_smbroutslow2:1 tdnn_lstm1e_sp_bi_smbroutslow2:2 +# WER on dev(orig) 9.0 8.9 8.9 +# [looped:] 9.0 8.9 8.9 +# WER on dev(rescored) 8.4 8.3 8.4 +# [looped:] 8.4 8.3 8.4 +# WER on test(orig) 8.8 8.7 8.8 +# [looped:] 8.8 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.4 +# [looped:] 8.3 8.4 8.5 + + + +# Below is with 0.00002 and last_layer_factor=1.0. +# b01:s5_r2: steps/info/nnet3_disc_dir_info.pl exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr:num-jobs=4;lrate=2e-05;iters-per-epoch=138;epoch[0,1,2]:train-objf=[0.94,0.96,0.97],valid-objf=[0.95,0.96,0.96],train-counts=[0.24,0.12,0.09],valid-counts=[0.28,0.19,0.16] +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr:{1,2} +# System tdnn_lstm1e_sp_bi tdnn_lstm1e_sp_bi_smbr:1 tdnn_lstm1e_sp_bi_smbr:2 +# WER on dev(orig) 9.0 8.8 8.9 +# [looped:] 9.0 8.9 8.9 +# WER on dev(rescored) 8.4 8.3 8.4 +# [looped:] 8.4 8.3 8.4 +# WER on test(orig) 8.8 8.8 8.9 +# [looped:] 8.8 8.8 8.9 +# WER on test(rescored) 8.4 8.4 8.5 +# [looped:] 8.3 8.4 8.5 + + +set -e +set -uo pipefail + +stage=1 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=0 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like + # alignments and degs). +degs_dir= # set this to use preexisting degs. +nj=400 # have a high number of jobs because this could take a while, and we might + # have some stragglers. +# you can set disc_affix if you run different configurations, e.g. --disc-affix "_b" +disc_affix= + + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +srcdir=exp/chain_cleaned/tdnn_lstm1e_sp_bi +graph_dir=$srcdir/graph +train_data_dir=data/train_cleaned_sp_hires_comb +online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb + +## Objective options +criterion=smbr +one_silence_class=true + +dir=${srcdir}_${criterion}${disc_affix} + +## Egs options. Give quite a few choices of chunk length, +## so it can split utterances without much gap or overlap. +frames_per_eg=300,280,150,120,100 +frames_overlap_per_eg=0 +frames_per_chunk_egs=200 # frames-per-chunk for decoding in alignment and + # denlat decoding. +frames_per_chunk_decoding=140 # frames-per-chunk for decoding when we test + # the models. +## these context options should match the training condition. (chunk_left_context, +## chunk_right_context) +## We set --extra-left-context-initial 0 and --extra-right-context-final 0 +## directly in the script below, but this should also match the training condition. +extra_left_context=40 +extra_right_context=0 + + + +## Nnet training options +effective_learning_rate=0.00002 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=2 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. 
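+# For illustration only (the script leaves regularization_opts empty by
+# default): to mirror the regularization used when the 1e chain system that
+# this script starts from was trained, one could set something like
+#   regularization_opts="--xent-regularize=0.1 --l2-regularize=0.00005"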
+last_layer_factor=0.5 # have the output layer train slower than the others.. this can + # be helpful. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). + + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat </dev/null || true + + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x + # We don't test the iter "epoch${x}_adj", although it's computed, + # because prior-adjustment doesn't make sense for chain models + # and it degrades the results. + ( + steps/nnet3/decode_looped.sh \ + --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/${decode_set}_hires $dir/decode_looped_${decode_set}_${iter} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${decode_set}_hires \ + ${dir}/decode_looped_${decode_set}_${iter} ${dir}/decode_looped_${decode_set}_${iter}_rescore || exit 1 + ) || touch $dir/.error & + done + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +wait; + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. + # actually, keep the alignments in case we need them later.. they're slow to + # create, and quite big. + # rm ${srcdir}_ali/ali.*.gz || true + + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh new file mode 100755 index 00000000000..3ed14f30956 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1f.sh @@ -0,0 +1,315 @@ +#!/bin/bash + +# 1f is as 1d, but increasing decay-time from 40 to 80. [see also 1e, at 20.] +# see 1e for summary of results. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1f #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=80 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=80 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=80 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh new file mode 100755 index 00000000000..aff39a04025 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1g.sh @@ -0,0 +1,318 @@ +#!/bin/bash + +####################### +# 1g is as 1e, but reducing decay-time further from 20 to 10. +# see 1e for summary of results. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
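+# Any of the variables below can be overridden from the command line once
+# utils/parse_options.sh has been sourced (further down), e.g. a hypothetical
+# invocation; pick values appropriate to your cluster:
+#   local/chain/tuning/run_tdnn_lstm_1g.sh --stage 17 --train-stage -10 --decode-nj 38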
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1g #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=10 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=10 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=10 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true
+ for dset in dev test; do
+ (
+ steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \
+ --acwt 1.0 --post-decode-acwt 10.0 \
+ --extra-left-context-initial $extra_left_context_initial \
+ --frames-per-chunk 30 \
+ --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \
+ --scoring-opts "--min-lmwt 5 " \
+ $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1;
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+ data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1
+ ) || touch $dir/.error &
+ done
+ wait
+ if [ -f $dir/.error ]; then
+ echo "$0: something went wrong in decoding"
+ exit 1
+ fi
+fi
+
+
+exit 0
diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh
new file mode 100755
index 00000000000..8ffd43f27bc
--- /dev/null
+++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1h.sh
@@ -0,0 +1,316 @@
+#!/bin/bash
+
+#######################
+# 1h is as 1e, but increasing decay-time from 20 to 30.
+# 1e is as 1b, but reducing decay-time from 40 to 20.
+
+# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it
+# uses egs from 1b, remember to remove that before I commit.
+
+# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi
+# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091)
+
+# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below
+# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had
+# better results. Note: these results are not with the updated LM (the LM data-prep
+# for this setup was changed in Nov 2016 but this was with an older directory).
+#
+# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi
+# System                    lstm1d_sp_bi  lstm1e_sp_bi  tdnn_lstm1a_sp_bi
+# WER on dev(orig)          10.3          10.7          9.7
+# WER on dev(rescored)      9.8           10.1          9.3
+# WER on test(orig)         9.7           9.8           9.1
+# WER on test(rescored)     9.2           9.4           8.7
+# Final train prob          -0.0812       -0.0862       -0.0625
+# Final valid prob          -0.1049       -0.1047       -0.0910
+# Final train prob (xent)   -1.1334       -1.1763       -0.8518
+# Final valid prob (xent)   -1.2263       -1.2427       -0.9972
+
+## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here;
+## otherwise call it directly in its location).
+# by default, with cleanup:
+# local/chain/run_tdnn_lstm.sh
+
+# without cleanup:
+# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" &
+
+# note, if you have already run one of the non-chain nnet3 systems
+# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14.
+
+# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly
+# standard LSTM, except that some TDNN layers were added in between the
+# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but
+# this isn't exactly copied from there.
+
+
+set -e -o pipefail
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1h #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=30 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=30 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=30 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh new file mode 100755 index 00000000000..62497ca59ff --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1i.sh @@ -0,0 +1,337 @@ +#!/bin/bash + +# 1i is as 1e, but adding boundary-offset. No clear effect. +# +# the 3 columns below are: baseline; boundary-offset with that component +# learning with 10x the normal learning rate; boundary-offset with +# regular learning rate. There seems no clear benefit from this +# idea. Reverting the code changes that supported it; +# see ~dpovey/patches/lstm_boundary.patch + + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1i_sp_bi exp/chain_cleaned/tdnn_lstm1i_sp_bi.orig_learning_rate +# System tdnn_lstm1e_sp_bi tdnn_lstm1i_sp_bi tdnn_lstm1i_sp_bi.orig_learning_rate +# WER on dev(orig) 9.0 9.1 8.9 +# [looped:] 9.0 9.0 9.0 +# WER on dev(rescored) 8.4 8.3 8.3 +# [looped:] 8.4 8.2 8.2 +# WER on test(orig) 8.8 8.9 8.9 +# [looped:] 8.8 8.9 8.9 +# WER on test(rescored) 8.4 8.4 8.4 +# [looped:] 8.3 8.4 8.4 +# Final train prob -0.0648 -0.0625 -0.0644 +# Final valid prob -0.0827 -0.0833 -0.0855 +# Final train prob (xent) -0.8372 -0.8129 -0.8286 +# Final valid prob (xent) -0.9497 -0.9558 -0.9641 + + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1i #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 boundary-offset=true + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 boundary-offset=true + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 boundary-offset=true + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh new file mode 100755 index 00000000000..c9a57f0ab4d --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1j.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +# 1j is as 1e, but adding self-repair-scale=1.0e-04 on 1st tdnn layer [default is 1e-5]. +# It's definitely more effective in preventing under or over-saturated ReLUs, but +# it's not clear that there is any other benefit. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,j}_sp_bi +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1j_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1j_sp_bi +# WER on dev(orig) 9.0 9.1 +# [looped:] 9.0 9.1 +# WER on dev(rescored) 8.4 8.5 +# [looped:] 8.4 8.5 +# WER on test(orig) 8.8 9.0 +# [looped:] 8.8 9.1 +# WER on test(rescored) 8.4 8.6 +# [looped:] 8.3 8.5 +# Final train prob -0.0648 -0.0646 +# Final valid prob -0.0827 -0.0835 +# Final train prob (xent) -0.8372 -0.8296 +# Final valid prob (xent) -0.9497 -0.9597 + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). 
+# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1j #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 self-repair-scale=1.0e-04 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh new file mode 100755 index 00000000000..ab9d6ce6342 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1k.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# 1k is as 1e, but introducing a dropout schedule. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 +# [looped:] 9.0 8.6 8.9 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 +# [looped:] 8.4 7.8 8.2 8.3 +# WER on test(orig) 8.8 8.8 8.9 8.9 +# [looped:] 8.8 8.7 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.2 8.5 +# [looped:] 8.3 8.3 8.3 8.4 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 + + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). 
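+# (To make the soft link point here, something like the following should work,
+# run from s5_r2 and assuming the usual egs directory layout:
+#   ln -sf tuning/run_tdnn_lstm_1k.sh local/chain/run_tdnn_lstm.sh )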
+# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1k #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
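+  # For reference, the schedule given to train.py below,
+  #   --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0@0.75,0'
+  # is (as we understand it) a piecewise-linear function of the fraction of
+  # training data processed: dropout stays at 0 until 20% of training, rises
+  # linearly to 0.7 at the half-way point, falls back to 0 by 75%, and then
+  # stays at 0 to the end.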
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
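+  # (The [looped:] rows in the WER comparison at the top of this file come
+  # from these decode_looped_* directories.)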
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh new file mode 100755 index 00000000000..e09df86558a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1l.sh @@ -0,0 +1,330 @@ +#!/bin/bash + + +# 1l is as 1k, but having the dropout end at the end of training, not @0.75. + +# see run_tdnn_lstm_1k.sh for results. + + +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
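+# All of the options below can be overridden on the command line via
+# utils/parse_options.sh (sourced further down); for example, a hypothetical
+# rerun of just the network-training and decoding stages might look like:
+#   local/chain/tuning/run_tdnn_lstm_1l.sh --stage 17 --train-stage -10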
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1l #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
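+  # (With xent_regularize=0.1, the learning_rate_factor computed above works
+  # out to 0.5 / 0.1 = 5.0; see the comments on the xent output layer in the
+  # xconfig below for why that factor is used.)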
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0' \ + --trainer.optimization.combine-sum-to-one-penalty=0.001 \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh new file mode 100755 index 00000000000..3e75c9fe3e0 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -0,0 +1,330 @@ +#!/bin/bash + + +# 1m is as 1l, but having the dropout end at 0.1 +# see run_tdnn_lstm_1k.sh for results. + +# 1l is as 1k, but having the dropout end at the end of training. + +# 1k is as 1e, but introducing a dropout schedule. + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
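+# Note that frames_per_chunk below is a comma-separated list of chunk widths
+# used when dumping egs; the first entry (140) is also what we use at decode
+# time, via frames_per_chunk_primary.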
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1m #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
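+  # Note that 1m's schedule, '0,0@0.20,0.7@0.5,0.1', does not go back to
+  # zero: it ends at a residual dropout proportion of 0.1 at the end of
+  # training (that is the change relative to 1l; see the comments at the top).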
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.7@0.5,0.1' \ + --trainer.optimization.combine-sum-to-one-penalty=0.001 \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh new file mode 100755 index 00000000000..ed79404f815 --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1n.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# 1n is as 1k, but maxing out at 0.5, not 0.7. +# 1k is as 1e, but introducing a dropout schedule. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 +# [looped:] 9.0 8.6 8.9 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 +# [looped:] 8.4 7.8 8.2 8.3 +# WER on test(orig) 8.8 8.8 8.9 8.9 +# [looped:] 8.8 8.7 8.8 8.8 +# WER on test(rescored) 8.4 8.3 8.2 8.5 +# [looped:] 8.3 8.3 8.3 8.4 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 + + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). 
+# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1n #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. 
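+  # 1n's schedule, '0,0@0.20,0.5@0.5,0@0.75,0', is the same shape as 1k's but
+  # peaks at a dropout proportion of 0.5 rather than 0.7.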
+ lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.5@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
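+  # (Only --extra-left-context-initial is passed here: since the looped
+  # computation carries the recurrent state forward across chunks, the
+  # per-chunk extra context used in regular decoding above shouldn't be
+  # needed.)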
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh new file mode 100755 index 00000000000..ec97bce3a8b --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1o.sh @@ -0,0 +1,344 @@ +#!/bin/bash + +# 1o is as 1k, but putting the dropout on (c,m), i.e. the output +# of the LstmNonlinearityComponent, which I believe is the same as +# putting it on (i,f) which Gaofeng found worked well in the non-fast Lstm +# component; and using schedule which maxes out at 0.3, not 0.7. +# [note: this was a little worse. turns out it was not the same as +# what gaofeng did because he had separate masks on (i,f). +# note: I've since removed the script-level support for this. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,k,l,m,n,o}_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1k_sp_bi tdnn_lstm1l_sp_bi tdnn_lstm1m_sp_bi tdnn_lstm1n_sp_bi tdnn_lstm1o_sp_bi +# WER on dev(orig) 9.0 8.7 8.9 9.0 8.8 8.8 +# [looped:] 9.0 8.6 8.9 8.9 8.8 8.9 +# WER on dev(rescored) 8.4 7.9 8.2 8.2 8.1 8.1 +# [looped:] 8.4 7.8 8.2 8.3 8.1 8.2 +# WER on test(orig) 8.8 8.8 8.9 8.9 8.7 8.7 +# [looped:] 8.8 8.7 8.8 8.8 8.7 8.7 +# WER on test(rescored) 8.4 8.3 8.2 8.5 8.3 8.2 +# [looped:] 8.3 8.3 8.3 8.5 8.3 8.2 +# Final train prob -0.0648 -0.0693 -0.0768 -0.0807 -0.0702 -0.0698 +# Final valid prob -0.0827 -0.0854 -0.0943 -0.0931 -0.0836 -0.0858 +# Final train prob (xent) -0.8372 -0.8848 -0.9371 -0.9807 -0.8719 -0.8998 +# Final valid prob (xent) -0.9497 -0.9895 -1.0546 -1.0629 -0.9732 -1.0084 + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). 
+# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +# decode options +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 +frames_per_chunk=140,100,160 +frames_per_chunk_primary=140 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1o #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + # note: the value of the dropout-proportion is not important, as it's + # controlled by the dropout schedule; what's important is that we set it. + lstmp_opts="decay-time=20 dropout-proportion=0.0 dropout-place=2 dropout-per-frame=true" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.dropout-schedule='0,0@0.20,0.3@0.5,0@0.75,0' \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir \ + --cleanup=false + # --cleanup=false is temporary while debugging. +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh new file mode 100755 index 00000000000..b3da38e412a --- /dev/null +++ b/egs/tedlium/s5_r2/local/chain/tuning/run_tdnn_lstm_1r.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# 1r is as 1e, but changing update-period of natural gradient from 4 to 1, +# Not helpful. + +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1{e,r}_sp_bi +# local/chain/compare_wer_general.sh --looped exp/chain_cleaned/tdnn_lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1r_sp_bi +# System tdnn_lstm1e_sp_bi tdnn_lstm1r_sp_bi +# WER on dev(orig) 9.0 9.0 +# [looped:] 9.0 9.1 +# WER on dev(rescored) 8.4 8.5 +# [looped:] 8.4 8.6 +# WER on test(orig) 8.8 9.1 +# [looped:] 8.8 9.0 +# WER on test(rescored) 8.4 8.4 +# [looped:] 8.3 8.5 +# Final train prob -0.0648 -0.0642 +# Final valid prob -0.0827 -0.0838 +# Final train prob (xent) -0.8372 -0.8319 +# Final valid prob (xent) -0.9497 -0.9635 + +# 1e is as 1b, but reducing decay-time from 40 to 20. + +# 1d is as 1b, but adding decay-time=40 to the fast-lstmp-layers. note: it +# uses egs from 1b, remember to remove that before I commit. + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_lstm1a_sp_bi +# exp/chain_cleaned/tdnn_lstm1a_sp_bi: num-iters=253 nj=2..12 num-params=9.5M dim=40+100->3607 combine=-0.07->-0.07 xent:train/valid[167,252,final]=(-0.960,-0.859,-0.852/-1.05,-0.999,-0.997) logprob:train/valid[167,252,final]=(-0.076,-0.064,-0.062/-0.099,-0.092,-0.091) + +# This is as run_lstm1e.sh except adding TDNN layers in between; also comparing below +# with run_lstm1d.sh which had a larger non-recurrent-projection-dim and which had +# better results. Note: these results are not with the updated LM (the LM data-prep +# for this setup was changed in Nov 2016 but this was with an older directory). +# +# local/chain/compare_wer_general.sh exp/chain_cleaned/lstm1d_sp_bi exp/chain_cleaned/lstm1e_sp_bi exp/chain_cleaned/tdnn_lstm1a_sp_bi +# System lstm1d_sp_bi lstm1e_sp_bi tdnn_lstm1a_sp_bi +# WER on dev(orig) 10.3 10.7 9.7 +# WER on dev(rescored) 9.8 10.1 9.3 +# WER on test(orig) 9.7 9.8 9.1 +# WER on test(rescored) 9.2 9.4 8.7 +# Final train prob -0.0812 -0.0862 -0.0625 +# Final valid prob -0.1049 -0.1047 -0.0910 +# Final train prob (xent) -1.1334 -1.1763 -0.8518 +# Final valid prob (xent) -1.2263 -1.2427 -0.9972 + +## how you run this (note: this assumes that the run_tdnn_lstm.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn_lstm.sh + +# without cleanup: +# local/chain/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run one of the non-chain nnet3 systems +# (e.g. 
local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# run_tdnn_lstm_1a.sh was modified from run_lstm_1e.sh, which is a fairly +# standard, LSTM, except that some TDNN layers were added in between the +# LSTM layers. I was looking at egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_1i.sh, but +# this isn't exactly copied from there. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +label_delay=5 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +# training options +chunk_left_context=40 +chunk_right_context=0 +chunk_left_context_initial=0 +chunk_right_context_final=0 +frames_per_chunk=140,100,160 +# decode options +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) +extra_left_context=50 +extra_right_context=0 +extra_left_context_initial=0 +extra_right_context_final=0 + + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_lstm_affix=1r #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir=exp/chain_cleaned/tdnn_lstm1b_sp_bi/egs # you can set this to use previously dumped egs. +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + + +if [ $stage -le 17 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + tdnn_opts='ng-affine-options="update-period=1"' + lstmp_opts='ng-affine-options="update-period=1" decay-time=20' + output_opts='max-change=1.5 ng-affine-options="update-period=1"' + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=512 $tdnn_opts + relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) $tdnn_opts + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn3 dim=512 input=Append(-3,0,3) $tdnn_opts + relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) $tdnn_opts + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + relu-renorm-layer name=tdnn5 dim=512 input=Append(-3,0,3) $tdnn_opts + relu-renorm-layer name=tdnn6 dim=512 input=Append(-3,0,3) $tdnn_opts + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstmp_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width "$frames_per_chunk" \ + --egs.chunk-left-context "$chunk_left_context" \ + --egs.chunk-right-context "$chunk_right_context" \ + --egs.chunk-left-context-initial "$chunk_left_context_initial" \ + --egs.chunk-right-context-final "$chunk_right_context_final" \ + --trainer.num-chunk-per-minibatch 128,64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.deriv-truncate-margin 10 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.momentum 0.0 \ + --cleanup.remove-egs "$remove_egs" \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +if [ $stage -le 21 ]; then + # 'looped' decoding. we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). 
+ rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context-initial $extra_left_context_initial \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0 diff --git a/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..da0bb728e69 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/compare_wer.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_c_sp exp/nnet3_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3_cleaned/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/nnet3_cleaned/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
+# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum ${dirname}_online/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep log-like | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid acc " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.{final,combined}.log 2>/dev/null | grep accuracy | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo diff --git a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh index b4f2dd3e3b4..16093616b05 100755 --- a/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh +++ b/egs/tedlium/s5_r2/local/nnet3/run_ivector_common.sh @@ -21,9 +21,9 @@ num_threads_ubm=32 nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it # becomes exp/nnet3_cleaned or whatever. -. cmd.sh +. ./cmd.sh . ./path.sh -. ./utils/parse_options.sh +. 
utils/parse_options.sh gmm_dir=exp/${gmm} diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh deleted file mode 100755 index 91ba913c183..00000000000 --- a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash - -# This is the standard "tdnn" system, built in nnet3; this script -# is the version that's meant to run with data-cleanup, that doesn't -# support parallel alignments. - - -# by default, with cleanup: -# local/nnet3/run_tdnn.sh - -# without cleanup: -# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & - - -set -e -o pipefail -u - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -nj=30 -decode_nj=30 -min_seg_len=1.55 -train_set=train_cleaned -gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. -num_threads_ubm=32 -nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned -tdnn_affix= #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. - -# Options which are not passed through to run_ivector_common.sh -train_stage=-10 -splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" -remove_egs=true -relu_dim=850 -num_epochs=3 - -. cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat </dev/null - for dset in dev test; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ - ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 - steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ - data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - - -exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh new file mode 120000 index 00000000000..8e647598556 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh new file mode 120000 index 00000000000..50d28fb91f3 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_disc.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_1a_disc.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh new file mode 120000 index 00000000000..8e03c924bc1 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/run_tdnn_lstm_lfr.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_lfr_1a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..80ff91b8606 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# This is the standard "tdnn" system, built in nnet3; this script +# 
is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1a #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -7,2 -3,3 0 0" +remove_egs=true +relu_dim=850 +num_epochs=3 + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat </dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..f6e4fb71b75 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,172 @@ +#!/bin/bash + + +# 1b is as 1a but uses xconfigs. + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1b_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
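+# Note: all of the variables set below can be overridden from the command line
+# thanks to utils/parse_options.sh; e.g. to resume an interrupted run you might
+# do something like (hypothetical stage values):
+#   local/nnet3/run_tdnn.sh --stage 13 --train-stage 50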
+stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1b #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +relu_dim=850 +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=850 + relu-renorm-layer name=tdnn2 dim=850 input=Append(-1,2) + relu-renorm-layer name=tdnn3 dim=850 input=Append(-3,3) + relu-renorm-layer name=tdnn4 dim=850 input=Append(-7,2) + relu-renorm-layer name=tdnn5 dim=850 input=Append(-3,3) + relu-renorm-layer name=tdnn6 dim=850 + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
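+  # (the reason they are identical is that a TDNN has no recurrent state, so
+  # splitting the input into chunks cannot change the network's outputs.)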
+ rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh new file mode 100755 index 00000000000..35789342ffb --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_1c.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# 1c is as 1b but using more 'chain-like' splicing and slightly +# smaller dim. Not better; maybe slightly worse. + +# note: the num-params is almost the same. +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1{b,c}_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) +# exp/nnet3_cleaned/tdnn1c_sp: num-iters=240 nj=2..12 num-params=10.1M dim=40+100->4187 combine=-1.16->-1.15 loglike:train/valid[159,239,combined]=(-1.22,-1.16,-1.15/-1.41,-1.38,-1.38) accuracy:train/valid[159,239,combined]=(0.66,0.67,0.68/0.62,0.63,0.63) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1{b,c}_sp +# System tdnn1b_sp tdnn1c_sp +# WER on dev(orig) 11.7 11.9 +# WER on dev(rescored) 10.9 11.1 +# WER on test(orig) 11.7 11.8 +# WER on test(rescored) 11.0 11.2 +# Final train prob -0.9416 -1.1505 +# Final valid prob -1.1496 -1.3805 +# Final train acc 0.7241 0.6756 +# Final valid acc 0.6788 0.6255 + +# This is the standard "tdnn" system, built in nnet3; this script +# is the version that's meant to run with data-cleanup, that doesn't +# support parallel alignments. + + +# steps/info/nnet3_dir_info.pl exp/nnet3_cleaned/tdnn1b_sp +# exp/nnet3_cleaned/tdnn1b_sp: num-iters=240 nj=2..12 num-params=10.3M dim=40+100->4187 combine=-0.95->-0.95 loglike:train/valid[159,239,combined]=(-1.01,-0.95,-0.94/-1.18,-1.16,-1.15) accuracy:train/valid[159,239,combined]=(0.71,0.72,0.72/0.67,0.68,0.68) + +# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn1a_sp exp/nnet3_cleaned/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# WER on dev(orig) 11.9 11.7 +# WER on dev(rescored) 11.2 10.9 +# WER on test(orig) 11.6 11.7 +# WER on test(rescored) 11.0 11.0 +# Final train prob -0.9255 -0.9416 +# Final valid prob -1.1842 -1.1496 +# Final train acc 0.7245 0.7241 +# Final valid acc 0.6771 0.6788 + + +# by default, with cleanup: +# local/nnet3/run_tdnn.sh + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1c #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. 
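+# (together with $nnet3_affix this determines the experiment directory name,
+# e.g. exp/nnet3_cleaned/tdnn1c_sp in the results quoted at the top of this
+# script.)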
+ +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
+ rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh new file mode 100755 index 00000000000..666c2f1bb31 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lfr_1a.sh @@ -0,0 +1,200 @@ +#!/bin/bash + + +# run_tdnn_lfr_1a.sh is similar in configuration to run_tdnn_1c.sh, but it's a +# low-frame-rate system (see egs/swbd/s5c/local/nnet3/tuning/run_tdnn_lfr1c.sh +# for an example of such a system). + + +# by default, with cleanup: +# local/nnet3/run_tdnn_lfr.sh + +# without cleanup: +# local/nnet3/run_tdnn_lfr.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned +tdnn_affix=1a #affix for TDNN directory e.g. "a" or "b", in case we change the configuration. + +# Options which are not passed through to run_ivector_common.sh +train_stage=-10 +remove_egs=true +srand=0 +reporting_email=dpovey@gmail.com +# set common_egs_dir to use previously dumped egs. +common_egs_dir= + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 13 ]; then + # Build a tree using our new topology and a reduced sampling rate. + # We use 4000 leaves, which is a little less than the number used + # in the baseline GMM system (5k) in this setup, since generally + # LFR systems do best with somewhat fewer leaves. + # + # To get the stats to build the tree this script only uses every third frame, + # but it dumps converted alignments that essentially have 3 different + # frame-shifted versions of the alignment interpolated together; these can be + # used without modification in getting labels for training. 
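+  # (--frame-subsampling-factor 3 is what makes this a low-frame-rate tree, and
+  # --repeat-frames true is what produces the interleaved, full-frame-rate
+  # converted alignments described above, which are used directly as the
+  # targets for steps/nnet3/train_dnn.py below.)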
+ steps/nnet3/chain/build_tree.sh \ + --repeat-frames true --frame-subsampling-factor 3 \ + --cmd "$train_cmd" 4000 data/${train_set}_sp_comb \ + $lang $ali_dir $treedir +fi + +if [ $stage -le 14 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=750 + relu-renorm-layer name=tdnn2 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn3 dim=750 input=Append(-1,0,1) + relu-renorm-layer name=tdnn4 dim=750 input=Append(-3,0,3) + relu-renorm-layer name=tdnn5 dim=750 input=Append(-6,-3,0) + output-layer name=output dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=3 \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.0015 \ + --trainer.optimization.final-effective-lrate=0.00015 \ + --trainer.optimization.minibatch-size=256,128 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$treedir \ + --lang=$lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + echo 3 >$dir/frame_subsampling_factor +fi + +if [ $stage -le 16 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh data/lang/phones.txt $lang/phones.txt + utils/mkgraph.sh --self-loop-scale 0.333 data/lang $dir $dir/graph +fi + + +if [ $stage -le 17 ]; then + # note: for TDNNs, looped decoding gives exactly the same results + # as regular decoding, so there is no point in testing it separately. + # We use regular decoding because it supports multi-threaded (we just + # didn't create the binary for that, for looped decoding, so far). 
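+  # We decode with --acwt 0.333 (the same scale given to mkgraph.sh via
+  # --self-loop-scale above) and then multiply the acoustic scores back up
+  # with --post-decode-acwt 3.0, so that the usual range of LM weights is
+  # appropriate when scoring the lattices.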
+ rm $dir/.error || true 2>/dev/null + for dset in dev test; do + ( + steps/nnet3/decode.sh --acwt 0.333 --post-decode-acwt 3.0 --nj $decode_nj \ + --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $dir/graph data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh new file mode 100755 index 00000000000..28c45836cf7 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a.sh @@ -0,0 +1,219 @@ +#!/bin/bash + +# this is a TDNN+LSTM system; the configuration is similar to +# local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and +# with 1.5 times larger hidden dimensions. + + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(orig) 11.0 11.0 +# [looped:] 11.0 11.1 +# WER on dev(rescored) 10.4 10.3 +# [looped:] 10.3 10.5 +# WER on test(orig) 10.7 10.6 +# [looped:] 10.7 10.7 +# WER on test(rescored) 10.1 9.9 +# [looped:] 10.0 10.0 +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1a +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 decay-time=20 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=15 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --frames-per-chunk $frames_per_chunk \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1 + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} 
${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + + +if [ $stage -le 15 ]; then + # 'looped' decoding. + # note: you should NOT do this decoding step for setups that have bidirectional + # recurrence, like BLSTMs-- it doesn't make sense and will give bd results. + # we didn't write a -parallel version of this program yet, + # so it will take a bit longer as the --num-threads option is not supported. + # we just hardcode the --frames-per-chunk option as it doesn't have to + # match any value used in training, and it won't affect the results (unlike + # regular decoding). + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + $graph_dir data/${dset}_hires $dir/decode_looped_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh new file mode 100755 index 00000000000..1826caf3d05 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1a_disc.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# This script does discriminative training on top of CE nnet3 system. To +# simplify things, this assumes you are using the "cleaned" data (since this is +# generally better), i.e. it won't work if you used options to run_tdnn_lstm_1a.sh +# to use the non-cleaned data. +# +# note: this relies on having a cluster that has plenty of CPUs as well as GPUs, +# since the alignment and the lattice generation/egs-dumping takes quite a bit +# of CPU time. + +# below is with the current settings (effective_learning_rate=0.0000025, last_layer_factor=0.5): +# steps/info/nnet3_disc_dir_info.pl exp/nnet3_cleaned/tdnn_lstm1a_sp_smbrslow +# exp/nnet3_cleaned/tdnn_lstm1a_sp_smbrslow:num-jobs=4;effective-lrate=2.5e-06;last-layer-factor=0.50;iters-per-epoch=55;epoch[0,1,2,3]:train-objf=[0.94,0.96,0.97,0.97],valid-objf=[0.91,0.93,0.93,0.93],train-counts=[0.40,0.25,0.17,0.12],valid-counts=[0.57,0.31,0.34,0.35] + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1a_sp_smbrslow:{1,2,3} +# System tdnn_lstm1a_sp tdnn_lstm1a_sp_smbrslow:1 tdnn_lstm1a_sp_smbrslow:2 tdnn_lstm1a_sp_smbrslow:3 +# WER on dev(orig) 11.0 9.4 9.4 9.4 +# [looped:] 11.0 9.4 9.5 9.4 +# WER on dev(rescored) 10.3 8.8 8.7 8.7 +# [looped:] 10.3 8.8 8.9 8.9 +# WER on test(orig) 10.8 9.6 9.7 9.6 +# [looped:] 10.7 9.6 9.6 9.7 +# WER on test(rescored) 10.1 9.1 9.2 9.1 +# [looped:] 10.0 9.1 9.2 9.1 + +# Below is with twice the lrate (5e-06) and the same last-layer-factor (0.5). Trained too fast. +# exp/nnet3_cleaned/tdnn_lstm1a_sp_smbr:num-jobs=4;effective-lrate=5e-06;last-layer-factor=0.50;iters-per-epoch=55;epoch[0,1,2,3]:train-objf=[0.94,0.97,0.97,0.98],valid-objf=[0.91,0.93,0.93,0.93],train-counts=[0.40,0.22,0.12,0.09],valid-counts=[0.57,0.31,0.27,0.32] +# I'm not showing the looped decoding results with this older step; +# there was a script bug (now fixed) and I don't want to rerun them. 
+# local/nnet3/compare_wer.sh exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1a_sp_smbr:{1,2,3} +# System tdnn_lstm1a_sp tdnn_lstm1a_sp_smbr:1 tdnn_lstm1a_sp_smbr:2 tdnn_lstm1a_sp_smbr:3 +# WER on dev(orig) 11.0 9.4 9.4 9.5 +# WER on dev(rescored) 10.3 8.8 8.8 8.9 +# WER on test(orig) 10.8 9.6 9.8 9.8 +# WER on test(rescored) 10.1 9.1 9.3 9.4 + +set -e +set -uo pipefail + +stage=1 +train_stage=-10 # can be used to start training in the middle. +get_egs_stage=0 +use_gpu=true # for training +cleanup=false # run with --cleanup true --stage 6 to clean up (remove large things like + # alignments and degs). +degs_dir= # set this to use preexisting degs. +nj=400 # have a high number of jobs because this could take a while, and we might + # have some stragglers. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +graph_dir=exp/tri3_cleaned/graph +srcdir=exp/nnet3_cleaned/tdnn_lstm1a_sp +train_data_dir=data/train_cleaned_sp_hires_comb +online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb + +## Objective options +criterion=smbr +one_silence_class=true + +# originally ran with effective_learning_rate=0.000005, +# changing to effective_learning_rate=0.0000025 and using affix=slow + +# you can set --disc-affix if you run different configurations. +disc_affix= + +dir=${srcdir}_${criterion}${disc_affix} + +## Egs options. Give quite a few choices of chunk length, +## so it can split utterances without much gap or overlap. +frames_per_eg=300,280,150,120,100 +frames_overlap_per_eg=0 +frames_per_chunk_egs=200 # for alignments and denlat creation. +frames_per_chunk_decoding=50 # for decoding; should be the same as the value + # used in the script that trained the nnet. + # We didn't set the frames_per_chunk in + # run_tdnn_lstm_1a.sh, so it defaults to 50. +## these context options should match the training condition. (chunk_left_context, +## chunk_right_context) +## We set --extra-left-context-initial 0 and --extra-right-context-final 0 +## directly in the script below, but this should also match the training condition. +## note: --extra-left-context should be the same as the chunk_left_context (or in +## general, the argument of --egs.chunk-left-context) in the baseline script. +extra_left_context=40 +extra_right_context=0 + + + +## Nnet training options +effective_learning_rate=0.0000025 +last_layer_factor=0.5 +max_param_change=1 +num_jobs_nnet=4 +num_epochs=3 +regularization_opts= # Applicable for providing --xent-regularize and --l2-regularize options, + # in chain models. +minibatch_size="300=32,16/150=64,32" # rule says: if chunk size is closer to 300, use minibatch size 32 (or 16 for mop-up); + # if chunk size is closer to 150, use mini atch size of 64 (or 32 for mop-up). + + +## Decode options +decode_start_epoch=1 # can be used to avoid decoding all epochs, e.g. if we decided to run more. + +if $use_gpu; then + if ! cuda-compiled; then + cat </dev/null || true + + for x in `seq $decode_start_epoch $num_epochs`; do + for decode_set in dev test; do + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + iter=epoch$x + # We don't test the iter "epoch${x}_adj", although it's computed, + # because prior-adjustment doesn't make sense for chain models + # and it degrades the results. 
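+      # (each epoch${x} model is written out by the discriminative training
+      # above; decoding all of them produces the per-epoch columns in the WER
+      # tables at the top of this script, which are generated with
+      # local/nnet3/compare_wer.sh using the dir:epoch syntax.)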
+ ( + steps/nnet3/decode_looped.sh \ + --nj $num_jobs --cmd "$decode_cmd" --iter $iter \ + --frames-per-chunk 30 \ + --online-ivector-dir exp/nnet3_cleaned/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/${decode_set}_hires $dir/decode_looped_${decode_set}_${iter} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${decode_set}_hires \ + ${dir}/decode_looped_${decode_set}_${iter} ${dir}/decode_looped_${decode_set}_${iter}_rescore || exit 1 + ) || touch $dir/.error & + done + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + + +if [ $stage -le 6 ] && $cleanup; then + # if you run with "--cleanup true --stage 6" you can clean up. + # actually, keep the alignments in case we need them later.. they're slow to + # create, and quite big. + # rm ${srcdir}_ali/ali.*.gz || true + + steps/nnet2/remove_egs.sh ${srcdir}_degs || true +fi + +exit 0; diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh new file mode 100755 index 00000000000..8b8af6eff78 --- /dev/null +++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +# 1b is as 1a, but removing the decay-time option as a baseline. + +# the decay-time option does seem to be having the expected interaction with +# 'looped' decoding, i.e. with the decay-time option we don't get a degradation +# from looped decoding (if anything, with decay time, looped decoding is a +# little better than baseline decoding). + +# local/nnet3/compare_wer.sh --looped exp/nnet3_cleaned/tdnn_lstm1a_sp exp/nnet3_cleaned/tdnn_lstm1b_sp +# System tdnn_lstm1a_sp tdnn_lstm1b_sp +# WER on dev(orig) 11.0 11.0 +# [looped:] 11.0 11.1 +# WER on dev(rescored) 10.3 10.3 +# [looped:] 10.3 10.5 +# WER on test(orig) 10.8 10.6 +# [looped:] 10.7 10.7 +# WER on test(rescored) 10.1 9.9 +# [looped:] 10.0 10.0 +# Final train prob -0.6881 -0.6897 +# Final valid prob -0.7796 -0.7989 +# Final train acc 0.7954 0.7946 +# Final valid acc 0.7611 0.7582 + + + +# this is a TDNN+LSTM system; the configuration is similar to +# local/chain/tuning/run_tdnn_lstm_1e.sh, but a non-chain nnet3 system, and +# with 1.5 times larger hidden dimensions. + +# by default, with cleanup: +# local/nnet3/run_tdnn_lstm.sh + +# without cleanup: +# local/nnet3/run_tdnn_lstm.sh --train-set train --gmm tri3 --nnet3-affix "" & + + +set -e -o pipefail -u + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +train_set=train_cleaned +gmm=tri3_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for exp dirs, e.g. _cleaned + +# Options which are not passed through to run_ivector_common.sh +affix=1b +common_egs_dir= +reporting_email= + +# LSTM options +train_stage=-10 +label_delay=5 + +# training chunk-options +chunk_width=40,30,20 +chunk_left_context=40 +chunk_right_context=0 +# decode chunk-size options (for non-looped decoding) +extra_left_context=50 +extra_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +extra_left_context= +extra_right_context= +frames_per_chunk= + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=768 + relu-renorm-layer name=tdnn2 dim=768 input=Append(-1,0,1) + fast-lstmp-layer name=lstm1 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 delay=-3 + relu-renorm-layer name=tdnn3 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn4 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm2 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 delay=-3 + relu-renorm-layer name=tdnn5 dim=768 input=Append(-3,0,3) + relu-renorm-layer name=tdnn6 dim=768 input=Append(-3,0,3) + fast-lstmp-layer name=lstm3 cell-dim=768 recurrent-projection-dim=192 non-recurrent-projection-dim=192 delay=-3 + + output-layer name=output input=lstm3 output-delay=$label_delay dim=$num_targets max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_rnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.samples-per-iter=10000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=15 \ + --trainer.optimization.initial-effective-lrate=0.0003 \ + --trainer.optimization.final-effective-lrate=0.00003 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.rnn.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.5 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang=data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + # caution: we don't set the --frames-per-chunk here, we just use the + # default value of 50, which happens to be suitable because it's + # close to the primary chunk_width of 40. 
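+      # (keeping the decode chunk size close to the training chunk width is
+      # preferable for recurrent nets, since the amount of temporal context the
+      # LSTM sees per chunk then roughly matches the training condition.)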
+if [ $stage -le 14 ]; then
+  [ -z $extra_left_context ] && extra_left_context=$chunk_left_context;
+  [ -z $extra_right_context ] && extra_right_context=$chunk_right_context;
+  [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width;
+  rm $dir/.error 2>/dev/null || true
+  for dset in dev test; do
+    (
+      # caution: we don't set --frames-per-chunk here; we just use the
+      # default value of 50, which happens to be suitable because it's
+      # close to the primary chunk_width of 40.
+      steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
+        --extra-left-context $extra_left_context \
+        --extra-right-context $extra_right_context \
+        --extra-left-context-initial 0 --extra-right-context-final 0 \
+        --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \
+        ${graph_dir} data/${dset}_hires ${dir}/decode_${dset} || exit 1
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+        data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+
+if [ $stage -le 15 ]; then
+  # 'looped' decoding.
+  # Note: you should NOT do this decoding step for setups that have bidirectional
+  # recurrence, like BLSTMs -- it doesn't make sense and will give bad results.
+  # We didn't write a -parallel version of this program yet, so it will take a
+  # bit longer, as the --num-threads option is not supported.
+  # We just hardcode the --frames-per-chunk option, as it doesn't have to
+  # match any value used in training, and it won't affect the results (unlike
+  # regular decoding).
+  rm $dir/.error 2>/dev/null || true
+  for dset in dev test; do
+    (
+      steps/nnet3/decode_looped.sh --nj $decode_nj --cmd "$decode_cmd" \
+        --frames-per-chunk 30 \
+        --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \
+        $graph_dir data/${dset}_hires $dir/decode_looped_${dset} || exit 1;
+      steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \
+        data/${dset}_hires ${dir}/decode_looped_${dset} ${dir}/decode_looped_${dset}_rescore || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  if [ -f $dir/.error ]; then
+    echo "$0: something went wrong in decoding"
+    exit 1
+  fi
+fi
+
+
+exit 0;
diff --git a/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh
new file mode 100755
index 00000000000..07c3d4af233
--- /dev/null
+++ b/egs/tedlium/s5_r2/local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+# This script does discriminative training on top of a cross-entropy (CE)
+# nnet3 system.  To simplify things, it assumes you are using the "cleaned"
+# data (since this is generally better), i.e. it won't work if you used
+# options to run_tdnn_lstm_1b.sh to use the non-cleaned data.
+#
+# Note: this relies on having a cluster with plenty of CPUs as well as GPUs,
+# since the alignment and the lattice generation/egs-dumping take quite a bit
+# of CPU time.
+
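+# For reference, the assumed workflow is (a sketch; directory names follow the
+# defaults set below):
+#
+#   local/nnet3/tuning/run_tdnn_lstm_1b.sh       # CE training -> exp/nnet3_cleaned/tdnn_lstm1b_sp
+#   local/nnet3/tuning/run_tdnn_lstm_1b_disc.sh  # this script: sMBR training on top of it
+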
+
+set -e
+set -uo pipefail
+
+stage=1
+train_stage=-10   # can be used to start training in the middle.
+get_egs_stage=0
+use_gpu=true      # for training
+cleanup=false     # run with --cleanup true --stage 6 to clean up (remove large
+                  # things like alignments and degs).
+degs_dir=         # set this to use preexisting degs.
+nj=400   # have a high number of jobs because this could take a while, and we
+         # might have some stragglers.
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+graph_dir=exp/tri3_cleaned/graph
+srcdir=exp/nnet3_cleaned/tdnn_lstm1b_sp
+train_data_dir=data/train_cleaned_sp_hires_comb
+online_ivector_dir=exp/nnet3_cleaned/ivectors_train_cleaned_sp_hires_comb
+
+## Objective options
+criterion=smbr
+one_silence_class=true
+
+# You can set --disc-affix if you run different configurations, e.g. --disc-affix "_b".
+# Note: I ran without an affix with learning rate 0.0000125, with disc_affix=slow
+# with learning rate 0.000005, and with disc_affix=slow2 with learning rate 0.0000025.
+# disc_affix=slow3 is with effective_learning_rate=0.000005 and last_layer_factor=0.1.
+
+disc_affix=slow3
+
+dir=${srcdir}_${criterion}${disc_affix}
+
+## Egs options.  Give quite a few choices of chunk length,
+## so it can split utterances without much gap or overlap.
+frames_per_eg=300,280,150,120,100
+frames_overlap_per_eg=0
+frames_per_chunk_egs=200       # for alignments and denlat creation.
+frames_per_chunk_decoding=50   # for decoding; should be the same as the value
+                               # used in the script that trained the nnet.
+                               # We didn't set the frames_per_chunk in
+                               # run_tdnn_lstm_1b.sh, so it defaults to 50.
+## These context options should match the training condition (chunk_left_context,
+## chunk_right_context).
+## We set --extra-left-context-initial 0 and --extra-right-context-final 0
+## directly in the script below, but this should also match the training condition.
+## Note: --extra-left-context should be the same as the chunk_left_context (or in
+## general, the argument of --egs.chunk-left-context) in the baseline script.
+extra_left_context=40
+extra_right_context=0
+
+
+## Nnet training options
+effective_learning_rate=0.000005
+last_layer_factor=0.1
+max_param_change=1
+num_jobs_nnet=4
+num_epochs=2
+regularization_opts=     # applicable for providing --xent-regularize and
+                         # --l2-regularize options, in chain models.
+minibatch_size="300=32,16/150=64,32"  # the rule is: if the chunk size is closer to 300,
+                                      # use minibatch size 32 (or 16 for mop-up);
+                                      # if the chunk size is closer to 150, use
+                                      # minibatch size 64 (or 32 for mop-up).
+
+
+## Decode options
+decode_start_epoch=1  # can be used to avoid decoding all epochs, e.g. if we decided to run more.
+
+if $use_gpu; then
+  if ! cuda-compiled; then
+    cat <